diff --git a/.travis.yml b/.travis.yml
index fb8ddb2c..d7e3f7cf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,14 +2,14 @@ language: python
 python:
   - "2.7"
 
-#Set virtual env with system-site-packages to true
-virtualenv:
-  system_site_packages: true
-
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
-before_install: 
-  - sudo apt-get install -qq python-scipy python-pip
-  - sudo apt-get install -qq python-matplotlib
+before_install:
+  #Install a mini version of anaconda such that we can easily install our dependencies
+  - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
+  - chmod +x miniconda.sh
+  - ./miniconda.sh -b
+  - export PATH=/home/travis/miniconda/bin:$PATH
+  - conda update --yes conda
   # Workaround for a permissions issue with Travis virtual machine images
   # that breaks Python's multiprocessing:
   # https://github.com/travis-ci/travis-cookbooks/issues/155
@@ -17,10 +17,10 @@ before_install:
   - sudo ln -s /run/shm /dev/shm
 
 install:
-  - pip install --upgrade numpy==1.7.1 
-  - pip install sphinx 
-  - pip install nose
-  - pip install . --use-mirrors
+  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.7 scipy=0.12 matplotlib nose sphinx pip nose
+  - pip install . 
+  #--use-mirrors
+  #
 # command to run tests, e.g. python setup.py test
 script: 
   - nosetests GPy/testing
diff --git a/GPy/FAQ.txt b/GPy/FAQ.txt
deleted file mode 100644
index 66ba4834..00000000
--- a/GPy/FAQ.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-Frequently Asked Questions
---------------------------
-
-Unit tests are run through Travis-Ci. They can be run locally through entering the GPy route diretory and writing
-
-nosetests testing/
-
-Documentation is handled by Sphinx. To build the documentation:
diff --git a/GPy/coding_style_guide.txt b/GPy/coding_style_guide.txt
deleted file mode 100644
index 0cc732e4..00000000
--- a/GPy/coding_style_guide.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-In this text document we will describe coding conventions to be used in GPy to keep things consistent.
-
-All arrays containing data are two dimensional. The first dimension is the number of data, the second dimension is number of features. This keeps things consistent with the idea of a design matrix.
-
-Input matrices are either X or t, output matrices are Y.
-
-Input dimensionality is input_dim, output dimensionality is output_dim, number of data is num_data.
-
-Data sets are preprocessed in the datasets.py file. This file also records where the data set was obtained from in the dictionary stored in the file. Long term we should move this dictionary to sqlite or similar.
-
diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py
index fb40a9e0..ebed29bb 100644
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from model import *
@@ -7,5 +7,6 @@ from parameterization.param import Param, ParamConcatenation
 from parameterization.observable_array import ObsAr
 
 from gp import GP
+from svgp import SVGP
 from sparse_gp import SparseGP
 from mapping import *
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 5c69d92b..3252ac08 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
@@ -124,6 +124,7 @@ class GP(Model):
             else:
                 self.X = ObsAr(X)
         self.update_model(True)
+        self._trigger_params_changed()
 
     def set_X(self,X):
         """
diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py
index 049f1699..111fec6f 100644
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import sys
diff --git a/GPy/core/model.py b/GPy/core/model.py
index ac5a9732..c5d318e7 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, 2013, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
@@ -10,6 +10,8 @@ import multiprocessing as mp
 import numpy as np
 from numpy.linalg.linalg import LinAlgError
 import itertools
+import sys
+from .verbose_optimization import VerboseOptimization
 # import numdifftools as ndt
 
 class Model(Parameterized):
@@ -24,12 +26,13 @@ class Model(Parameterized):
         from .parameterization.ties_and_remappings import Tie
         self.tie = Tie()
         self.link_parameter(self.tie, -1)
+        self.obj_grads = None
         self.add_observer(self.tie, self.tie._parameters_changed_notification, priority=-500)
 
     def log_likelihood(self):
         raise NotImplementedError, "this needs to be implemented to use the model class"
     def _log_likelihood_gradients(self):
-        return self.gradient
+        return self.gradient.copy()
 
     def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
         """
@@ -165,14 +168,14 @@ class Model(Parameterized):
         try:
             # self._set_params_transformed(x)
             self.optimizer_array = x
-            obj_grads = self._transform_gradients(self.objective_function_gradients())
+            self.obj_grads = self._transform_gradients(self.objective_function_gradients())
             self._fail_count = 0
         except (LinAlgError, ZeroDivisionError, ValueError):
             if self._fail_count >= self._allowed_failures:
                 raise
             self._fail_count += 1
-            obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
-        return obj_grads
+            self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
+        return self.obj_grads
 
     def _objective(self, x):
         """
@@ -200,17 +203,17 @@ class Model(Parameterized):
     def _objective_grads(self, x):
         try:
             self.optimizer_array = x
-            obj_f, obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients())
+            obj_f, self.obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients())
             self._fail_count = 0
         except (LinAlgError, ZeroDivisionError, ValueError):
             if self._fail_count >= self._allowed_failures:
                 raise
             self._fail_count += 1
             obj_f = np.inf
-            obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
-        return obj_f, obj_grads
+            self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e10, 1e10)
+        return obj_f, self.obj_grads
 
-    def optimize(self, optimizer=None, start=None, **kwargs):
+    def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, **kwargs):
         """
         Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
 
@@ -218,8 +221,8 @@ class Model(Parameterized):
 
         :param max_f_eval: maximum number of function evaluations
         :type max_f_eval: int
-        :messages: whether to display during optimisation
-        :type messages: bool
+        :messages: True: Display messages during optimisation, "ipython_notebook":
+        :type messages: bool"string
         :param optimizer: which optimizer to use (defaults to self.preferred optimizer)
         :type optimizer: string
 
@@ -233,13 +236,11 @@ class Model(Parameterized):
 
 
         """
-        if self.is_fixed:
-            print 'nothing to optimize'
-        if self.size == 0:
+        if self.is_fixed or self.size == 0:
             print 'nothing to optimize'
 
         if not self.update_model():
-            print "setting updates on again"
+            print "updates were off, setting updates on again"
             self.update_model(True)
 
         if start == None:
@@ -253,9 +254,11 @@ class Model(Parameterized):
             opt.model = self
         else:
             optimizer = optimization.get_optimizer(optimizer)
-            opt = optimizer(start, model=self, **kwargs)
-
-        opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
+            opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
+                        
+        with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
+            opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
+            vo.finish(opt)
 
         self.optimization_runs.append(opt)
 
@@ -367,8 +370,8 @@ class Model(Parameterized):
                 f1 = self._objective(xx)
                 xx[xind] -= 2.*step
                 f2 = self._objective(xx)
-                df_ratio = np.abs((f1-f2)/min(f1,f2))
-                df_unstable = df_ratio<df_tolerance
+                df_ratio = np.abs((f1 - f2) / min(f1, f2))
+                df_unstable = df_ratio < df_tolerance
                 numerical_gradient = (f1 - f2) / (2 * step)
                 if np.all(gradient[xind] == 0): ratio = (f1 - f2) == gradient[xind]
                 else: ratio = (f1 - f2) / (2 * step * gradient[xind])
@@ -398,16 +401,26 @@ class Model(Parameterized):
         """Representation of the model in html for notebook display."""
         model_details = [['<b>Model</b>', self.name + '<br>'],
                          ['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
-                         ["<b>Number of Parameters</b>", '{}<br>'.format(self.size)]]
+                         ["<b>Number of Parameters</b>", '{}<br>'.format(self.size)],
+                         ["<b>Updates</b>", '{}<br>'.format(self._update_on)],
+                         ]
         from operator import itemgetter
-        to_print = [""] + ["{}: {}".format(name, detail) for name, detail in model_details] + ["<br><b>Parameters</b>:"]
+        to_print = ["""<style type="text/css">
+.pd{
+    font-family: "Courier New", Courier, monospace !important;
+    width: 100%;
+    padding: 3px;
+}
+</style>\n"""] + ["<p class=pd>"] + ["{}: {}".format(name, detail) for name, detail in model_details] + ["</p>"]
         to_print.append(super(Model, self)._repr_html_())
         return "\n".join(to_print)
 
     def __str__(self):
         model_details = [['Name', self.name],
                          ['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
-                         ["Number of Parameters", '{}'.format(self.size)]]
+                         ["Number of Parameters", '{}'.format(self.size)],
+                         ["Updates", '{}'.format(self._update_on)],
+                         ]
         from operator import itemgetter
         max_len = reduce(lambda a, b: max(len(b[0]), a), model_details, 0)
         to_print = [""] + ["{0:{l}} : {1}".format(name, detail, l=max_len) for name, detail in model_details] + ["Parameters:"]
diff --git a/GPy/core/parameterization/domains.py b/GPy/core/parameterization/domains.py
index cf930ed8..c04b414f 100644
--- a/GPy/core/parameterization/domains.py
+++ b/GPy/core/parameterization/domains.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-'''
+"""
 (Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`.
 These domains specify the legitimate realm of the parameters to live in.
 
@@ -10,14 +10,14 @@ These domains specify the legitimate realm of the parameters to live in.
 
 :const:`~GPy.core.domains._POSITIVE`:
     positive domain, only positive real values are allowed
-    
+
 :const:`~GPy.core.domains._NEGATIVE`:
     same as :const:`~GPy.core.domains._POSITIVE`, but only negative values are allowed
-    
+
 :const:`~GPy.core.domains._BOUNDED`:
     only values within the bounded range are allowed,
     the bounds are specified withing the object with the bounded range
-'''
+"""
 
 _REAL = 'real'
 _POSITIVE = "positive"
diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py
index ddd689ed..61c82da1 100644
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@@ -1,8 +1,6 @@
-'''
-Created on Oct 2, 2013
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-@author: maxzwiessele
-'''
 import numpy
 from numpy.lib.function_base import vectorize
 from lists_and_dicts import IntArrayDict
diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py
index 0343909e..5afbb8ed 100644
--- a/GPy/core/parameterization/lists_and_dicts.py
+++ b/GPy/core/parameterization/lists_and_dicts.py
@@ -1,8 +1,5 @@
-'''
-Created on 27 Feb 2014
-
-@author: maxz
-'''
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from collections import defaultdict
 import weakref
diff --git a/GPy/core/parameterization/observable.py b/GPy/core/parameterization/observable.py
index ad55b44c..8a85c6ca 100644
--- a/GPy/core/parameterization/observable.py
+++ b/GPy/core/parameterization/observable.py
@@ -1,8 +1,5 @@
-'''
-Created on 30 Oct 2014
-
-@author: maxz
-'''
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
 class Observable(object):
@@ -17,6 +14,10 @@ class Observable(object):
         super(Observable, self).__init__()
         from lists_and_dicts import ObserverList
         self.observers = ObserverList()
+        self._update_on = True
+
+    def set_updates(self, on=True):
+        self._update_on = on
 
     def add_observer(self, observer, callble, priority=0):
         """
@@ -54,16 +55,17 @@ class Observable(object):
         :param min_priority: only notify observers with priority > min_priority
                              if min_priority is None, notify all observers in order
         """
-        if which is None:
-            which = self
-        if min_priority is None:
-            [callble(self, which=which) for _, _, callble in self.observers]
-        else:
-            for p, _, callble in self.observers:
-                if p <= min_priority:
-                    break
-                callble(self, which=which)
+        if self._update_on:
+            if which is None:
+                which = self
+            if min_priority is None:
+                [callble(self, which=which) for _, _, callble in self.observers]
+            else:
+                for p, _, callble in self.observers:
+                    if p <= min_priority:
+                        break
+                    callble(self, which=which)
 
     def change_priority(self, observer, callble, priority):
         self.remove_observer(observer, callble)
-        self.add_observer(observer, callble, priority)
\ No newline at end of file
+        self.add_observer(observer, callble, priority)
diff --git a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py
index 4a7bdf85..271fe7b9 100644
--- a/GPy/core/parameterization/observable_array.py
+++ b/GPy/core/parameterization/observable_array.py
@@ -1,7 +1,6 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2014, Max Zwiessele
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-__updated__ = '2014-11-11'
 
 import numpy as np
 from parameter_core import Pickleable
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 5560a362..1246bc18 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2014, Max Zwiessele
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import itertools
@@ -84,6 +84,7 @@ class Param(Parameterizable, ObsAr):
         self._original_ = getattr(obj, '_original_', None)
         self._name = getattr(obj, '_name', None)
         self._gradient_array_ = getattr(obj, '_gradient_array_', None)
+        self._update_on = getattr(obj, '_update_on', None)
         self.constraints = getattr(obj, 'constraints', None)
         self.priors = getattr(obj, 'priors', None)
 
@@ -264,15 +265,21 @@ class Param(Parameterizable, ObsAr):
         ties = [' '.join(map(lambda x: x, t)) for t in ties]
         header_format = """
 <tr>
-  <td><b>{i}</b></td>
-  <td><b>{x}</b></td>
-  <td><b>{c}</b></td>
-  <td><b>{p}</b></td>
-  <td><b>{t}</b></td>
+  <th><b>{i}</b></th>
+  <th><b>{x}</b></th>
+  <th><b>{c}</b></th>
+  <th><b>{p}</b></th>
+  <th><b>{t}</b></th>
 </tr>"""
         header = header_format.format(x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__)  # nice header for printing
         if not ties: ties = itertools.cycle([''])
-        return "\n".join(['<table>'] + [header] + ["<tr><td>{i}</td><td align=\"right\">{x}</td><td>{c}</td><td>{p}</td><td>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"])  
+        return "\n".join(["""<style type="text/css">
+.tg  {padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;width:100%;}
+.tg td{font-family:"Courier New", Courier, monospace !important;font-weight:bold;color:#444;background-color:#F7FDFA;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+.tg th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+.tg .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
+.tg .tg-right{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:right;}
+</style>"""] + ['<table class="tg">'] + [header] + ["<tr><td class=tg-left>{i}</td><td  class=tg-right>{x}</td><td class=tg-left>{c}</td><td class=tg-left>{p}</td><td class=tg-left>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"])
 
     def __str__(self, constr_matrix=None, indices=None, prirs=None, ties=None, lc=None, lx=None, li=None, lp=None, lt=None, only_name=False):
         filter_ = self._current_slice_
@@ -354,7 +361,7 @@ class ParamConcatenation(object):
     #===========================================================================
     def update_all_params(self):
         for par in self.parents:
-            par.notify_observers()
+            par.trigger_update(trigger_parent=False)
 
     def constrain(self, constraint, warning=True):
         [param.constrain(constraint, trigger_parent=False) for param in self.params]
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index 6add95b0..bee160b2 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -471,7 +471,7 @@ class Indexable(Nameable, Updateable):
             self.param_array[...] = transform.initialize(self.param_array)
         reconstrained = self.unconstrain()
         added = self._add_to_index_operations(self.constraints, reconstrained, transform, warning)
-        self.notify_observers(self, None if trigger_parent else -np.inf)
+        self.trigger_update(trigger_parent)
         return added
 
     def unconstrain(self, *transforms):
@@ -684,6 +684,17 @@ class OptimizationHandlable(Indexable):
         if self._has_fixes(): return g[self._fixes_]
         return g
 
+    def _transform_gradients_non_natural(self, g):
+        """
+        Transform the gradients by multiplying the gradient factor for each
+        constraint to it.
+        """
+        self._highest_parent_.tie.collate_gradient()
+        [np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+        if self._has_fixes(): return g[self._fixes_]
+        return g
+
+
     @property
     def num_params(self):
         """
@@ -798,7 +809,7 @@ class Parameterizable(OptimizationHandlable):
     A parameterisable class.
 
     This class provides the parameters list (ArrayList) and standard parameter handling,
-    such as {add|remove}_parameter(), traverse hierarchy and param_array, gradient_array
+    such as {link|unlink}_parameter(), traverse hierarchy and param_array, gradient_array
     and the empty parameters_changed().
 
     This class is abstract and should not be instantiated.
@@ -1031,6 +1042,9 @@ class Parameterizable(OptimizationHandlable):
                     p = param_to_array(p)
                     d = f.create_dataset(n,p.shape,dtype=p.dtype)
                     d[:] = p
+                if hasattr(self, 'param_array'):
+                    d = f.create_dataset('param_array',self.param_array.shape, dtype=self.param_array.dtype)
+                    d[:] = self.param_array
                 f.close()
             except:
                 raise 'Fails to write the parameters into a HDF5 file!'
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 2c91b235..44173f58 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2014, Max Zwiessele, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
@@ -215,9 +215,9 @@ class Parameterized(Parameterizable):
         self._highest_parent_._notify_parent_change()
 
     def add_parameter(self, *args, **kwargs):
-        raise DeprecationWarning, "add_parameter was renamed to link_parameter to avoid confusion of setting variables"
+        raise DeprecationWarning, "add_parameter was renamed to link_parameter to avoid confusion of setting variables, use link_parameter instead"
     def remove_parameter(self, *args, **kwargs):
-        raise DeprecationWarning, "remove_parameter was renamed to link_parameter to avoid confusion of setting variables"
+        raise DeprecationWarning, "remove_parameter was renamed to unlink_parameter to avoid confusion of setting variables, use unlink_parameter instead"
 
     def _connect_parameters(self, ignore_added_names=False):
         # connect parameterlist to this parameterized object
@@ -296,7 +296,7 @@ class Parameterized(Parameterizable):
                 self.param_array[name] = value
             except:
                 raise ValueError, "Setting by slice or index only allowed with array-like"
-            self._trigger_params_changed()
+            self.trigger_update()
         else:
             try: param = self.__getitem__(name, paramlist)
             except: raise
@@ -377,7 +377,7 @@ class Parameterized(Parameterizable):
         cl = max([len(str(x)) if x else 0 for x in constrs + ["Constraint"]])
         tl = max([len(str(x)) if x else 0 for x in ts + ["Tied to"]])
         pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
-        format_spec = "<tr><td>{{name:<{0}s}}</td><td align=\"right\">{{desc:>{1}s}}</td><td>{{const:^{2}s}}</td><td>{{pri:^{3}s}}</td><td>{{t:^{4}s}}</td></tr>".format(nl, sl, cl, pl, tl)
+        format_spec = "<tr><td class=tg-left>{{name:<{0}s}}</td><td class=tg-right>{{desc:>{1}s}}</td><td class=tg-left>{{const:^{2}s}}</td><td class=tg-left>{{pri:^{3}s}}</td><td class=tg-left>{{t:^{4}s}}</td></tr>".format(nl, sl, cl, pl, tl)
         to_print = []
         for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
             to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
@@ -385,13 +385,21 @@ class Parameterized(Parameterizable):
         if header:
             header = """
 <tr>
-  <td><b>{name}</b>
-  <td><b>Value</b></td>
-  <td><b>Constraint</b></td>
-  <td><b>Prior</b></td>
-  <td><b>Tied to</b></td>""".format(name=name)
+  <th><b>{name}</b></th>
+  <th><b>Value</b></th>
+  <th><b>Constraint</b></th>
+  <th><b>Prior</b></th>
+  <th><b>Tied to</b></th>
+</tr>""".format(name=name)
             to_print.insert(0, header)
-        return '<table>' + '\n'.format(sep).join(to_print) + '\n</table>'
+        style = """<style type="text/css">
+.tg  {font-family:"Courier New", Courier, monospace !important;padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;width:100%;}
+.tg td{font-family:"Courier New", Courier, monospace !important;font-weight:bold;color:#444;background-color:#F7FDFA;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+.tg th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+.tg .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
+.tg .tg-right{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:right;}
+</style>"""
+        return style + '\n' + '<table class="tg">' + '\n'.format(sep).join(to_print) + '\n</table>'
 
     def __str__(self, header=True):
         name = adjust_name_for_printing(self.name) + "."
diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py
index 8c0c4b44..20a78691 100644
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py
index 291076a1..d929b1d9 100644
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@@ -42,6 +42,8 @@ class Transformation(object):
             \frac{\frac{\partial L}{\partial f}\left(\left.\partial f(x)}{\partial x}\right|_{x=f^{-1}(f)\right)}
         """
         raise NotImplementedError
+    def gradfactor_non_natural(self, model_param, dL_dmodel_param):
+        return self.gradfactor(model_param, dL_dmodel_param)
     def initialize(self, f):
         """ produce a sensible initial value for f(x)"""
         raise NotImplementedError
@@ -77,8 +79,10 @@ class Logexp(Transformation):
 
 
 class NormalTheta(Transformation):
+    "Do not use, not officially supported!"
     _instances = []
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -98,6 +102,7 @@ class NormalTheta(Transformation):
         # that the values are ok
         # Before:
         theta[self.var_indices] = np.abs(-.5/theta[self.var_indices])
+        #theta[self.var_indices] = np.exp(-.5/theta[self.var_indices])
         theta[self.mu_indices] *= theta[self.var_indices]
         return theta # which is now {mu, var}
 
@@ -106,6 +111,7 @@ class NormalTheta(Transformation):
         varp = muvar[self.var_indices]
         muvar[self.mu_indices] /= varp
         muvar[self.var_indices] = -.5/varp
+        #muvar[self.var_indices] = -.5/np.log(varp)
 
         return muvar # which is now {theta1, theta2}
 
@@ -139,9 +145,10 @@ class NormalTheta(Transformation):
         self.var_indices = state[1]
 
 class NormalNaturalAntti(NormalTheta):
+    "Do not use, not officially supported!"
     _instances = []
-    _logexp = Logexp()
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -178,8 +185,10 @@ class NormalNaturalAntti(NormalTheta):
         return "natantti"
 
 class NormalEta(Transformation):
+    "Do not use, not officially supported!"
     _instances = []
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -219,8 +228,10 @@ class NormalEta(Transformation):
         return "eta"
 
 class NormalNaturalThroughTheta(NormalTheta):
+    "Do not use, not officially supported!"
     _instances = []
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -243,6 +254,56 @@ class NormalNaturalThroughTheta(NormalTheta):
         dmuvar[self.mu_indices] -= 2*mu*dmuvar[self.var_indices]
         #=======================================================================
 
+        #=======================================================================
+        # This is by going through theta fully and then going into eta direction:
+        #dmu = dmuvar[self.mu_indices]
+        #dmuvar[self.var_indices] += dmu*mu*(var + 4/var)
+        #=======================================================================
+        return dmuvar # which is now the gradient multiplicator
+
+    def gradfactor_non_natural(self, muvar, dmuvar):
+        mu = muvar[self.mu_indices]
+        var = muvar[self.var_indices]
+        #=======================================================================
+        # theta gradients
+        # This works and the gradient checks!
+        dmuvar[self.mu_indices] *= var
+        dmuvar[self.var_indices] *= 2*(var)**2
+        dmuvar[self.var_indices] += 2*dmuvar[self.mu_indices]*mu
+        #=======================================================================
+
+        return dmuvar # which is now the gradient multiplicator for {theta1, theta2}
+
+    def __str__(self):
+        return "natgrad"
+
+
+class NormalNaturalWhooot(NormalTheta):
+    "Do not use, not officially supported!"
+    _instances = []
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
+        if cls._instances:
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            for instance in cls._instances:
+                if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
+                    return instance()
+        o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
+        cls._instances.append(weakref.ref(o))
+        return cls._instances[-1]()
+
+    def __init__(self, mu_indices, var_indices):
+        self.mu_indices = mu_indices
+        self.var_indices = var_indices
+
+    def gradfactor(self, muvar, dmuvar):
+        #mu = muvar[self.mu_indices]
+        #var = muvar[self.var_indices]
+
+        #=======================================================================
+        # This is just eta direction:
+        #dmuvar[self.mu_indices] -= 2*mu*dmuvar[self.var_indices]
+        #=======================================================================
 
         #=======================================================================
         # This is by going through theta fully and then going into eta direction:
@@ -255,8 +316,10 @@ class NormalNaturalThroughTheta(NormalTheta):
         return "natgrad"
 
 class NormalNaturalThroughEta(NormalEta):
+    "Do not use, not officially supported!"
     _instances = []
-    def __new__(cls, mu_indices, var_indices):
+    def __new__(cls, mu_indices=None, var_indices=None):
+        "Do not use, not officially supported!"
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
diff --git a/GPy/core/parameterization/updateable.py b/GPy/core/parameterization/updateable.py
index daf07d3c..379e92e1 100644
--- a/GPy/core/parameterization/updateable.py
+++ b/GPy/core/parameterization/updateable.py
@@ -11,18 +11,9 @@ class Updateable(Observable):
     A model can be updated or not.
     Make sure updates can be switched on and off.
     """
-    _updates = True
     def __init__(self, *args, **kwargs):
         super(Updateable, self).__init__(*args, **kwargs)
 
-    @property
-    def updates(self):
-        raise DeprecationWarning("updates is now a function, see update(True|False|None)")
-
-    @updates.setter
-    def updates(self, ups):
-        raise DeprecationWarning("updates is now a function, see update(True|False|None)")
-
     def update_model(self, updates=None):
         """
         Get or set, whether automatic updates are performed. When updates are
@@ -35,18 +26,18 @@ class Updateable(Observable):
             None: get the current update state
         """
         if updates is None:
-            p = getattr(self, '_highest_parent_', None)
-            if p is not None:
-                self._updates = p._updates
-            return self._updates
+            return self._update_on
         assert isinstance(updates, bool), "updates are either on (True) or off (False)"
         p = getattr(self, '_highest_parent_', None)
-        if p is not None:
-            p._updates = updates
-        self._updates = updates
+        def turn_updates(s):
+            s._update_on = updates
+        p.traverse(turn_updates)
         self.trigger_update()
 
     def toggle_update(self):
+        print "deprecated: toggle_update was renamed to update_toggle for easier access"
+        self.update_toggle()
+    def update_toggle(self):
         self.update_model(not self.update_model())
 
     def trigger_update(self, trigger_parent=True):
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 73c80c76..005ef2ac 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
@@ -6,7 +6,8 @@ from gp import GP
 from parameterization.param import Param
 from ..inference.latent_function_inference import var_dtc
 from .. import likelihoods
-from parameterization.variational import VariationalPosterior
+from parameterization.variational import VariationalPosterior, NormalPosterior
+from ..util.linalg import mdot
 
 import logging
 from GPy.inference.latent_function_inference.posterior import Posterior
@@ -102,7 +103,15 @@ class SparseGP(GP):
 
     def _raw_predict(self, Xnew, full_cov=False, kern=None):
         """
-        Make a prediction for the latent function values
+        Make a prediction for the latent function values. 
+    
+        For certain inputs we give back a full_cov of shape NxN,
+        if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of, 
+        we take only the diagonal elements across N.
+        
+        For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we 
+        return a NxDxD matrix and in the not full_cov case, we return the diagonal elements across D (NxD).
+        This is for both with and without missing data.
         """
 
         if kern is None: kern = self.kern
@@ -121,12 +130,32 @@ class SparseGP(GP):
                 Kxx = kern.Kdiag(Xnew)
                 var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
         else:
-            Kx = kern.psi1(self.Z, Xnew)
-            mu = np.dot(Kx, self.posterior.woodbury_vector)
-            if full_cov:
-                raise NotImplementedError, "TODO"
-            else:
-                Kxx = kern.psi0(self.Z, Xnew)
-                psi2 = kern.psi2(self.Z, Xnew)
-                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
+            psi0_star = self.kern.psi0(self.Z, Xnew)
+            psi1_star = self.kern.psi1(self.Z, Xnew)
+            #psi2_star = self.kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
+            la = self.posterior.woodbury_vector
+            mu = np.dot(psi1_star, la) # TODO: dimensions?
+            
+            if full_cov: 
+                var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1]))
+                di = np.diag_indices(la.shape[1])
+            else: 
+                var = np.empty((Xnew.shape[0], la.shape[1]))
+                
+            for i in range(Xnew.shape[0]):
+                _mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
+                psi2_star = self.kern.psi2(self.Z, NormalPosterior(_mu, _var))
+                tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
+
+                var_ = mdot(la.T, tmp, la)
+                p0 = psi0_star[i]
+                t = np.atleast_3d(self.posterior.woodbury_inv)
+                t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2)
+                
+                if full_cov:
+                    var_[di] += p0
+                    var_[di] += -t2
+                    var[i] = var_
+                else:
+                    var[i] = np.diag(var_)+p0-t2
         return mu, var
diff --git a/GPy/core/sparse_gp_mpi.py b/GPy/core/sparse_gp_mpi.py
index e8779f51..15d3ad76 100644
--- a/GPy/core/sparse_gp_mpi.py
+++ b/GPy/core/sparse_gp_mpi.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py
new file mode 100644
index 00000000..42044b1b
--- /dev/null
+++ b/GPy/core/svgp.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2014, James Hensman, Alex Matthews
+# Distributed under the terms of the GNU General public License, see LICENSE.txt
+
+import numpy as np
+from ..util import choleskies
+from sparse_gp import SparseGP
+from parameterization.param import Param
+from ..inference.latent_function_inference import SVGP as svgp_inf
+
+
+class SVGP(SparseGP):
+    def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
+        """
+        Stochastic Variational GP.
+
+        For Gaussian Likelihoods, this implements
+
+        Gaussian Processes for Big data, Hensman, Fusi and Lawrence, UAI 2013,
+
+        But without natural gradients. We'll use the lower-triangluar
+        representation of the covariance matrix to ensure
+        positive-definiteness.
+
+        For Non Gaussian Likelihoods, this implements
+
+        Hensman, Matthews and Ghahramani, Scalable Variational GP Classification, ArXiv 1411.2005
+        """
+        if batchsize is None:
+            batchsize = X.shape[0]
+
+        self.X_all, self.Y_all = X, Y
+        # how to rescale the batch likelihood in case of minibatches
+        self.batchsize = batchsize
+        batch_scale = float(self.X_all.shape[0])/float(self.batchsize)
+        #KL_scale = 1./np.float64(self.mpi_comm.size)
+        KL_scale = 1.0
+
+        import climin.util
+        #Make a climin slicer to make drawing minibatches much quicker. Annoyingly, this doesn;t pickle.
+        self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
+        X_batch, Y_batch = self.new_batch()
+
+        #create the SVI inference method
+        inf_method = svgp_inf()
+
+        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
+                 name=name, Y_metadata=Y_metadata, normalizer=False)
+
+        self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
+        chol = choleskies.triang_to_flat(np.tile(np.eye(self.num_inducing)[:,:,None], (1,1,Y.shape[1])))
+        self.chol = Param('q_u_chol', chol)
+        self.link_parameter(self.chol)
+        self.link_parameter(self.m)
+
+    def parameters_changed(self):
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
+
+        #update the kernel gradients
+        self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
+        grad = self.kern.gradient.copy()
+        self.kern.update_gradients_full(self.grad_dict['dL_dKmn'], self.Z, self.X)
+        grad += self.kern.gradient
+        self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
+        self.kern.gradient += grad
+        if not self.Z.is_fixed:# only compute these expensive gradients if we need them
+            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.kern.gradients_X(self.grad_dict['dL_dKmn'], self.Z, self.X)
+
+        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
+        #update the variational parameter gradients:
+        self.m.gradient = self.grad_dict['dL_dm']
+        self.chol.gradient = self.grad_dict['dL_dchol']
+
+    def set_data(self, X, Y):
+        """
+        Set the data without calling parameters_changed to avoid wasted computation
+        If this is called by the stochastic_grad function this will immediately update the gradients
+        """
+        assert X.shape[1]==self.Z.shape[1]
+        self.X, self.Y = X, Y
+
+    def new_batch(self):
+        """
+        Return a new batch of X and Y by taking a chunk of data from the complete X and Y
+        """
+        i = self.slicer.next()
+        return self.X_all[i], self.Y_all[i]
+
+    def stochastic_grad(self, parameters):
+        self.set_data(*self.new_batch())
+        return self._grads(parameters)
+
+    def optimizeWithFreezingZ(self):
+        self.Z.fix()
+        self.kern.fix()
+        self.optimize('bfgs')
+        self.Z.unfix()
+        self.kern.constrain_positive()
+        self.optimize('bfgs')
diff --git a/GPy/core/symbolic.py b/GPy/core/symbolic.py
index 24623f91..ed3a9d59 100644
--- a/GPy/core/symbolic.py
+++ b/GPy/core/symbolic.py
@@ -11,8 +11,8 @@ from sympy.utilities.lambdify import lambdastr, _imp_namespace, _get_namespace
 from sympy.utilities.iterables import numbered_symbols
 import scipy
 import GPy
-#from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma
-#@NDL you removed this file! #from GPy.util.symbolic import normcdf, normcdfln, logistic, logisticln, erfcx, erfc, gammaln
+
+
 def getFromDict(dataDict, mapList):
     return reduce(lambda d, k: d[k], mapList, dataDict)
 
diff --git a/GPy/core/verbose_optimization.py b/GPy/core/verbose_optimization.py
new file mode 100644
index 00000000..1a87b3da
--- /dev/null
+++ b/GPy/core/verbose_optimization.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2012-2014, Max Zwiessele.
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+import sys
+import time
+
+def exponents(fnow, current_grad):
+    exps = [np.abs(np.float(fnow)), current_grad]
+    return np.sign(exps) * np.log10(exps).astype(int)
+
+class VerboseOptimization(object):
+    def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True):
+        self.verbose = verbose
+        if self.verbose:
+            self.model = model
+            self.iteration = current_iteration
+            self.p_iter = self.iteration
+            self.maxiters = maxiters
+            self.len_maxiters = len(str(maxiters))
+            self.opt_name = opt.opt_name
+            self.model.add_observer(self, self.print_status)
+            self.status = 'running'
+
+            self.update()
+
+            try:
+                from IPython.display import display
+                from IPython.html.widgets import FloatProgressWidget, HTMLWidget, ContainerWidget
+                self.text = HTMLWidget()
+                self.progress = FloatProgressWidget()
+                self.model_show = HTMLWidget()
+                self.ipython_notebook = ipython_notebook
+            except:
+                # Not in Ipython notebook
+                self.ipython_notebook = False
+
+            if self.ipython_notebook:
+                self.text.set_css('width', '100%')
+                #self.progress.set_css('width', '100%')
+
+                left_col = ContainerWidget(children = [self.progress, self.text])
+                right_col = ContainerWidget(children = [self.model_show])
+                hor_align = ContainerWidget(children = [left_col, right_col])
+
+                display(hor_align)
+
+                left_col.set_css({
+                         'padding': '2px',
+                         'width': "100%",
+                         })
+
+                right_col.set_css({
+                         'padding': '2px',
+                         })
+
+                hor_align.set_css({
+                         'width': "100%",
+                         })
+
+                hor_align.remove_class('vbox')
+                hor_align.add_class('hbox')
+
+                left_col.add_class("box-flex1")
+                right_col.add_class('box-flex0')
+
+                #self.text.add_class('box-flex2')
+                #self.progress.add_class('box-flex1')
+            else:
+                self.exps = exponents(self.fnow, self.current_gradient)
+                print 'Running {} Code:'.format(self.opt_name)
+                print ' {3:7s}   {0:{mi}s}   {1:11s}    {2:11s}'.format("i", "f", "|g|", "secs", mi=self.len_maxiters)
+
+    def __enter__(self):
+        self.start = time.time()
+        return self
+
+    def print_out(self):
+        if self.ipython_notebook:
+            names_vals = [['optimizer', "{:s}".format(self.opt_name)],
+                          ['runtime [s]', "{:> g}".format(time.time()-self.start)],
+                          ['evaluation', "{:>0{l}}".format(self.iteration, l=self.len_maxiters)],
+                          ['objective', "{: > 12.3E}".format(self.fnow)],
+                          ['||gradient||', "{: >+12.3E}".format(float(self.current_gradient))],
+                          ['status', "{:s}".format(self.status)],
+                      ]
+            #message = "Lik:{:5.3E} Grad:{:5.3E} Lik:{:5.3E} Len:{!s}".format(float(m.log_likelihood()), np.einsum('i,i->', grads, grads), float(m.likelihood.variance), " ".join(["{:3.2E}".format(l) for l in m.kern.lengthscale.values]))
+            html_begin = """<style type="text/css">
+    .tg-opt  {font-family:"Courier New", Courier, monospace !important;padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;width:100%;}
+    .tg-opt td{font-family:"Courier New", Courier, monospace !important;font-weight:bold;color:#444;background-color:#F7FDFA;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+    .tg-opt th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+    .tg-opt .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
+    .tg-opt .tg-right{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:right;}
+    </style>
+    <table class="tg-opt">"""
+            html_end = "</table>"
+            html_body = ""
+            for name, val in names_vals:
+                html_body += "<tr>"
+                html_body += "<td class='tg-left'>{}</td>".format(name)
+                html_body += "<td class='tg-right'>{}</td>".format(val)
+                html_body += "</tr>"
+            self.text.value = html_begin + html_body + html_end
+            self.progress.value = 100*(self.iteration+1)/self.maxiters
+            self.model_show.value = self.model._repr_html_()
+        else:
+            n_exps = exponents(self.fnow, self.current_gradient)
+            if self.iteration - self.p_iter >= 20 * np.random.rand():
+                a = self.iteration >= self.p_iter * 2.78
+                b = np.any(n_exps < self.exps)
+                if a or b:
+                    self.p_iter = self.iteration
+                    print ''
+                if b:
+                    self.exps = n_exps
+            print '\r',
+            print '{3:> 7.2g}  {0:>0{mi}g}  {1:> 12e}  {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), time.time()-self.start, mi=self.len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
+            sys.stdout.flush()
+
+    def print_status(self, me, which=None):
+        self.update()
+
+        #sys.stdout.write(" "*len(self.message))
+        self.print_out()
+
+        self.iteration += 1
+
+    def update(self):
+        self.fnow = self.model.objective_function()
+        if self.model.obj_grads is not None:
+            grad = self.model.obj_grads
+            self.current_gradient = np.dot(grad, grad)
+        else:
+            self.current_gradient = np.nan
+
+    def finish(self, opt):
+        self.status = opt.status
+
+    def __exit__(self, type, value, traceback):
+        if self.verbose:
+            self.stop = time.time()
+            self.model.remove_observer(self)
+            self.print_out()
+
+            if not self.ipython_notebook:
+                print ''
+                print 'Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start)
+                print 'Optimization status: {0:.5g}'.format(self.status)
+                print
diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index 93994175..968333e0 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import classification
diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index b9d488d6..b3780073 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
 """
-Gaussian Processes classification
+Gaussian Processes classification examples
 """
 import GPy
 
diff --git a/GPy/examples/coreg_example.py b/GPy/examples/coreg_example.py
index 6ec635eb..4e9566dc 100644
--- a/GPy/examples/coreg_example.py
+++ b/GPy/examples/coreg_example.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 import numpy as np
 try:
     import pylab as pb
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index dc0b3dea..df9093a2 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -1,6 +1,7 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as _np
+default_seed = 123344
 
 # default_seed = _np.random.seed(123344)
 
diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py
index 57c841e4..ddac8813 100644
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2014, Alan Saul
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 import GPy
 import numpy as np
 from GPy.util import datasets
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 14cf0602..37a18f63 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 """
diff --git a/GPy/inference/__init__.py b/GPy/inference/__init__.py
index f1ffd595..7b1307e3 100644
--- a/GPy/inference/__init__.py
+++ b/GPy/inference/__init__.py
@@ -1,2 +1,3 @@
 import latent_function_inference
-import optimization
\ No newline at end of file
+import optimization
+import mcmc
diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py
index 3faf594c..67f57638 100644
--- a/GPy/inference/latent_function_inference/__init__.py
+++ b/GPy/inference/latent_function_inference/__init__.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2012-2014, Max Zwiessele, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 __doc__ = """
 Inference over Gaussian process latent functions
 
@@ -66,6 +69,7 @@ from expectation_propagation_dtc import EPDTC
 from dtc import DTC
 from fitc import FITC
 from var_dtc_parallel import VarDTC_minibatch
+from svgp import SVGP
 
 # class FullLatentFunctionData(object):
 #
diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py
index aa398166..5590a079 100644
--- a/GPy/inference/latent_function_inference/dtc.py
+++ b/GPy/inference/latent_function_inference/dtc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, James Hensman
+# Copyright (c) 2012-2014, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from posterior import Posterior
diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
index 0c02efe3..1312d36a 100644
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from posterior import Posterior
diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py
index 1afc8100..26144974 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
 from posterior import Posterior
diff --git a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
index 9ffb4945..35b1b7dc 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 import numpy as np
 from ...util import diag
 from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR
diff --git a/GPy/inference/latent_function_inference/inferenceX.py b/GPy/inference/latent_function_inference/inferenceX.py
index 25d8d799..f68f17cb 100644
--- a/GPy/inference/latent_function_inference/inferenceX.py
+++ b/GPy/inference/latent_function_inference/inferenceX.py
@@ -1,5 +1,6 @@
-"""
-"""
+# Copyright (c) 2014, Zhenwen Dai
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 import numpy as np
 from ...core import Model
 from ...core.parameterization import variational
@@ -7,27 +8,27 @@ from ...core.parameterization import variational
 def infer_newX(model, Y_new, optimize=True, init='L2'):
     """
     Infer the distribution of X for the new observed data *Y_new*.
-    
+
     :param model: the GPy model used in inference
     :type model: GPy.core.Model
     :param Y_new: the new observed data for inference
     :type Y_new: numpy.ndarray
     :param optimize: whether to optimize the location of new X (True by default)
     :type optimize: boolean
-    :return: a tuple containing the estimated posterior distribution of X and the model that optimize X 
+    :return: a tuple containing the estimated posterior distribution of X and the model that optimize X
     :rtype: (GPy.core.parameterization.variational.VariationalPosterior, GPy.core.Model)
     """
     infr_m = InferenceX(model, Y_new, init=init)
-    
+
     if optimize:
         infr_m.optimize()
-        
+
     return infr_m.X, infr_m
 
 class InferenceX(Model):
     """
     The class for inference of new X with given new Y. (do_test_latent)
-    
+
     :param model: the GPy model used in inference
     :type model: GPy.core.Model
     :param Y: the new observed data for inference
@@ -67,12 +68,12 @@ class InferenceX(Model):
         self.Y = Y
         self.X = self._init_X(model, Y, init=init)
         self.compute_dL()
-        
+
         self.link_parameter(self.X)
-    
+
     def _init_X(self, model, Y_new, init='L2'):
         # Initialize the new X by finding the nearest point in Y space.
-        
+
         Y = model.Y
         if self.missing_data:
             Y = Y[:,self.valid_dim]
@@ -86,7 +87,7 @@ class InferenceX(Model):
             elif init=='rand':
                 dist = np.random.rand(Y_new.shape[0],Y.shape[0])
         idx = dist.argmin(axis=1)
-        
+
         from ...models import SSGPLVM
         from ...util.misc import param_to_array
         if isinstance(model, SSGPLVM):
@@ -99,9 +100,9 @@ class InferenceX(Model):
             else:
                 from ...core import Param
                 X = Param('latent mean',param_to_array(model.X[idx]).copy())
-        
+
         return X
-        
+
     def compute_dL(self):
         # Common computation
         beta = 1./np.fmax(self.likelihood.variance, 1e-6)
@@ -120,7 +121,7 @@ class InferenceX(Model):
             self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))/2.
             self.dL_dpsi1 = beta*np.dot(self.Y, wv.T)
             self.dL_dpsi0 = -beta/2.*output_dim* np.ones(self.Y.shape[0])
-                
+
     def parameters_changed(self):
         if self.uncertain_input:
             psi0 = self.kern.psi0(self.Z, self.X)
@@ -132,7 +133,7 @@ class InferenceX(Model):
             psi2 = np.dot(psi1.T,psi1)
 
         self._log_marginal_likelihood = (self.dL_dpsi2*psi2).sum()+(self.dL_dpsi1*psi1).sum()+(self.dL_dpsi0*psi0).sum()
-        
+
         if self.uncertain_input:
             X_grad = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.dL_dpsi0, dL_dpsi1=self.dL_dpsi1, dL_dpsi2=self.dL_dpsi2)
             self.X.set_gradients(X_grad)
@@ -141,7 +142,7 @@ class InferenceX(Model):
             X_grad = self.kern.gradients_X_diag(self.dL_dpsi0, self.X)
             X_grad += self.kern.gradients_X(dL_dpsi1, self.X, self.Z)
             self.X.gradient = X_grad
-        
+
         if self.uncertain_input:
             from ...core.parameterization.variational import SpikeAndSlabPrior
             if isinstance(self.variational_prior, SpikeAndSlabPrior):
@@ -155,7 +156,7 @@ class InferenceX(Model):
                 # update for the KL divergence
                 self.variational_prior.update_gradients_KL(self.X)
             self._log_marginal_likelihood += -KL_div
-        
+
     def log_likelihood(self):
         return self._log_marginal_likelihood
 
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 2c741b9d..05711b0b 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
+# Copyright (c) 2013, 2014 Alan Saul
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 #
 #Parts of this file were influenced by the Matlab GPML framework written by
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index 66c68261..34f0b3bb 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -158,9 +158,11 @@ class Posterior(object):
                 #self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1)
                 symmetrify(self._woodbury_inv)
             elif self._covariance is not None:
-                B = self._K - self._covariance
-                tmp, _ = dpotrs(self.K_chol, B)
-                self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)                
+                B = np.atleast_3d(self._K) - np.atleast_3d(self._covariance)
+                self._woodbury_inv = np.empty_like(B)
+                for i in xrange(B.shape[-1]):
+                    tmp, _ = dpotrs(self.K_chol, B[:,:,i])
+                    self._woodbury_inv[:,:,i], _ = dpotrs(self.K_chol, tmp.T)
         return self._woodbury_inv
 
     @property
diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py
new file mode 100644
index 00000000..52db242c
--- /dev/null
+++ b/GPy/inference/latent_function_inference/svgp.py
@@ -0,0 +1,72 @@
+from . import LatentFunctionInference
+from ...util import linalg
+from ...util import choleskies
+import numpy as np
+from posterior import Posterior
+
+class SVGP(LatentFunctionInference):
+
+    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
+        num_inducing = Z.shape[0]
+        num_data, num_outputs = Y.shape
+
+        #expand cholesky representation
+        L = choleskies.flat_to_triang(q_u_chol)
+        S = np.einsum('ijk,ljk->ilk', L, L) #L.dot(L.T)
+        #Si,_ = linalg.dpotri(np.asfortranarray(L), lower=1)
+        Si = choleskies.multiple_dpotri(L)
+        logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[:,:,i])))) for i in range(L.shape[-1])])
+
+        if np.any(np.isinf(Si)):
+            raise ValueError("Cholesky representation unstable")
+            #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
+            #Si, Lnew, _,_ = linalg.pdinv(S)
+
+        #compute kernel related stuff
+        Kmm = kern.K(Z)
+        Knm = kern.K(X, Z)
+        Knn_diag = kern.Kdiag(X)
+        Kmmi, Lm, Lmi, logdetKmm = linalg.pdinv(Kmm)
+
+        #compute the marginal means and variances of q(f)
+        A = np.dot(Knm, Kmmi)
+        mu = np.dot(A, q_u_mean)
+        v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
+
+        #compute the KL term
+        Kmmim = np.dot(Kmmi, q_u_mean)
+        KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
+        KL = KLs.sum()
+        dKL_dm = Kmmim
+        dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
+        dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
+
+
+        #quadrature for the likelihood
+        F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v)
+
+        #rescale the F term if working on a batch
+        F, dF_dmu, dF_dv =  F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
+
+        #derivatives of expected likelihood
+        Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
+        Admu = A.T.dot(dF_dmu)
+        #AdvA = np.einsum('ijk,jl->ilk', Adv, A)
+        #AdvA = np.dot(A.T, Adv).swapaxes(0,1)
+        AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
+        tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
+        dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
+        dF_dKmm = 0.5*(dF_dKmm + dF_dKmm.T) # necessary? GPy bug?
+        tmp = 2.*(np.einsum('ij,jlk->ilk', Kmmi,S) - np.eye(num_inducing)[:,:,None])
+        dF_dKmn = np.einsum('ijk,jlk->il', tmp, Adv) + Kmmim.dot(dF_dmu.T)
+        dF_dm = Admu
+        dF_dS = AdvA
+
+        #sum (gradients of) expected likelihood and KL part
+        log_marginal = F.sum() - KL
+        dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
+
+        dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
+        dL_dchol = choleskies.triang_to_flat(dL_dchol)
+
+        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
index d61e7f0f..9c4d51bb 100644
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -21,7 +21,7 @@ class VarDTC(LatentFunctionInference):
     For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
 
     """
-    const_jitter = 1e-6
+    const_jitter = 1e-8
     def __init__(self, limit=1):
         #self._YYTfactor_cache = caching.cache()
         from ...util.caching import Cacher
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index 86842687..cac69872 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from posterior import Posterior
@@ -24,7 +24,7 @@ class VarDTC_minibatch(LatentFunctionInference):
     For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
 
     """
-    const_jitter = 1e-6
+    const_jitter = 1e-8
     def __init__(self, batchsize=None, limit=1, mpi_comm=None):
 
         self.batchsize = batchsize
diff --git a/GPy/inference/mcmc/hmc.py b/GPy/inference/mcmc/hmc.py
index 54893769..ec6399b6 100644
--- a/GPy/inference/mcmc/hmc.py
+++ b/GPy/inference/mcmc/hmc.py
@@ -1,4 +1,4 @@
-# ## Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# ## Copyright (c) 2014 Mu Niu, Zhenwen Dai and GPy Authors
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
diff --git a/GPy/inference/mcmc/samplers.py b/GPy/inference/mcmc/samplers.py
index fdb3df76..444d99d7 100644
--- a/GPy/inference/mcmc/samplers.py
+++ b/GPy/inference/mcmc/samplers.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# ## Copyright (c) 2014, Zhenwen Dai
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
diff --git a/GPy/inference/optimization/conjugate_gradient_descent.py b/GPy/inference/optimization/conjugate_gradient_descent.py
index 8f90b536..dfc4a48d 100644
--- a/GPy/inference/optimization/conjugate_gradient_descent.py
+++ b/GPy/inference/optimization/conjugate_gradient_descent.py
@@ -1,8 +1,6 @@
-'''
-Created on 24 Apr 2013
+# Copyright (c) 2012-2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-@author: maxz
-'''
 from gradient_descent_update_rules import FletcherReeves, \
     PolakRibiere
 from Queue import Empty
diff --git a/GPy/inference/optimization/gradient_descent_update_rules.py b/GPy/inference/optimization/gradient_descent_update_rules.py
index 1c14ed63..9536549c 100644
--- a/GPy/inference/optimization/gradient_descent_update_rules.py
+++ b/GPy/inference/optimization/gradient_descent_update_rules.py
@@ -1,8 +1,6 @@
-'''
-Created on 24 Apr 2013
+# Copyright (c) 2012-2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-@author: maxz
-'''
 import numpy
 
 class GDUpdateRule():
diff --git a/GPy/inference/optimization/optimization.py b/GPy/inference/optimization/optimization.py
index 45586a1d..aa9be793 100644
--- a/GPy/inference/optimization/optimization.py
+++ b/GPy/inference/optimization/optimization.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import datetime as dt
@@ -31,12 +31,13 @@ class Optimizer():
                  ftol=None, gtol=None, xtol=None, bfgs_factor=None):
         self.opt_name = None
         self.x_init = x_init
-        self.messages = messages
+        # Turning messages off and using internal structure for print outs:
+        self.messages = False #messages
         self.f_opt = None
         self.x_opt = None
         self.funct_eval = None
         self.status = None
-        self.max_f_eval = int(max_f_eval)
+        self.max_f_eval = int(max_iters)
         self.max_iters = int(max_iters)
         self.bfgs_factor = bfgs_factor
         self.trace = None
@@ -225,13 +226,11 @@ class opt_SCG(Optimizer):
         self.status = opt_result[3]
 
 def get_optimizer(f_min):
-    from sgd import opt_SGD
 
     optimizers = {'fmin_tnc': opt_tnc,
           'simplex': opt_simplex,
           'lbfgsb': opt_lbfgsb,
-          'scg': opt_SCG,
-          'sgd': opt_SGD}
+          'scg': opt_SCG}
 
     if rasm_available:
         optimizers['rasmussen'] = opt_rasm
diff --git a/GPy/inference/optimization/scg.py b/GPy/inference/optimization/scg.py
index e183b7a8..34dd181f 100644
--- a/GPy/inference/optimization/scg.py
+++ b/GPy/inference/optimization/scg.py
@@ -1,4 +1,4 @@
-# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
+# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2014)
 
 # Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
 
@@ -61,6 +61,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
     function_eval = 1
     fnow = fold
     gradnew = gradf(x, *optargs) # Initial gradient.
+    function_eval += 1
     #if any(np.isnan(gradnew)):
     #    raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
     current_grad = np.dot(gradnew, gradnew)
@@ -96,6 +97,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
             sigma = sigma0 / np.sqrt(kappa)
             xplus = x + sigma * d
             gplus = gradf(xplus, *optargs)
+            function_eval += 1
             theta = np.dot(d, (gplus - gradnew)) / sigma
 
         # Increase effective curvature and evaluate step size alpha.
@@ -111,10 +113,10 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
         fnew = f(xnew, *optargs)
         function_eval += 1
 
-#         if function_eval >= max_f_eval:
-#             status = "maximum number of function evaluations exceeded"
-#             break
-#             return x, flog, function_eval, status
+        if function_eval >= max_f_eval:
+            status = "maximum number of function evaluations exceeded"
+            break
+            return x, flog, function_eval, status
 
         Delta = 2.*(fnew - fold) / (alpha * mu)
         if Delta >= 0.:
@@ -156,6 +158,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
                 # Update variables for new position
                 gradold = gradnew
                 gradnew = gradf(x, *optargs)
+                function_eval += 1
                 current_grad = np.dot(gradnew, gradnew)
                 fold = fnew
                 # If the gradient is zero then we are done.
diff --git a/GPy/inference/optimization/sgd.py b/GPy/inference/optimization/sgd.py
deleted file mode 100644
index fd089bf5..00000000
--- a/GPy/inference/optimization/sgd.py
+++ /dev/null
@@ -1,346 +0,0 @@
-import numpy as np
-import scipy as sp
-import scipy.sparse
-from optimization import Optimizer
-from scipy import linalg, optimize
-import copy, sys, pickle
-
-class opt_SGD(Optimizer):
-    """
-    Optimize using stochastic gradient descent.
-
-    :param Model: reference to the Model object
-    :param iterations: number of iterations
-    :param learning_rate: learning rate
-    :param momentum: momentum
-
-    """
-
-    def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs):
-        self.opt_name = "Stochastic Gradient Descent"
-
-        self.Model = model
-        self.iterations = iterations
-        self.momentum = momentum
-        self.learning_rate = learning_rate
-        self.x_opt = None
-        self.f_opt = None
-        self.messages = messages
-        self.batch_size = batch_size
-        self.self_paced = self_paced
-        self.center = center
-        self.param_traces = [('noise',[])]
-        self.iteration_file = iteration_file
-        self.learning_rate_adaptation = learning_rate_adaptation
-        self.actual_iter = actual_iter
-        if self.learning_rate_adaptation != None:
-            if self.learning_rate_adaptation == 'annealing':
-                self.learning_rate_0 = self.learning_rate
-            else:
-                self.learning_rate_0 = self.learning_rate.mean()
-
-        self.schedule = schedule
-        # if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
-        #     self.param_traces.append(('bias',[]))
-        # if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
-        #     self.param_traces.append(('linear',[]))
-        # if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
-        #     self.param_traces.append(('rbf_var',[]))
-
-        self.param_traces = dict(self.param_traces)
-        self.fopt_trace = []
-
-        num_params = len(self.Model._get_params())
-        if isinstance(self.learning_rate, float):
-            self.learning_rate = np.ones((num_params,)) * self.learning_rate
-
-        assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter"
-
-    def __str__(self):
-        status = "\nOptimizer: \t\t\t %s\n" % self.opt_name
-        status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt
-        status += "Number of iterations: \t\t %d\n" % self.iterations
-        status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min())
-        status += "Momentum: \t\t\t %.3f\n" % self.momentum
-        status += "Batch size: \t\t\t %d\n" % self.batch_size
-        status += "Time elapsed: \t\t\t %s\n" % self.time
-        return status
-
-    def plot_traces(self):
-        """
-        See GPy.plotting.matplot_dep.inference_plots
-        """
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import inference_plots
-        inference_plots.plot_sgd_traces(self)
-
-    def non_null_samples(self, data):
-        return (np.isnan(data).sum(axis=1) == 0)
-
-    def check_for_missing(self, data):
-        if sp.sparse.issparse(self.Model.likelihood.Y):
-            return True
-        else:
-            return np.isnan(data).sum() > 0
-
-    def subset_parameter_vector(self, x, samples, param_shapes):
-        subset = np.array([], dtype = int)
-        x = np.arange(0, len(x))
-        i = 0
-
-        for s in param_shapes:
-            N, input_dim = s
-            X = x[i:i+N*input_dim].reshape(N, input_dim)
-            X = X[samples]
-            subset = np.append(subset, X.flatten())
-            i += N*input_dim
-
-        subset = np.append(subset, x[i:])
-
-        return subset
-
-    def shift_constraints(self, j):
-
-        constrained_indices = copy.deepcopy(self.Model.constrained_indices)
-
-        for c, constraint in enumerate(constrained_indices):
-            mask = (np.ones_like(constrained_indices[c]) == 1)
-            for i in range(len(constrained_indices[c])):
-                pos = np.where(j == constrained_indices[c][i])[0]
-                if len(pos) == 1:
-                    self.Model.constrained_indices[c][i] = pos
-                else:
-                    mask[i] = False
-
-            self.Model.constrained_indices[c] = self.Model.constrained_indices[c][mask]
-        return constrained_indices
-        # back them up
-        # bounded_i = copy.deepcopy(self.Model.constrained_bounded_indices)
-        # bounded_l = copy.deepcopy(self.Model.constrained_bounded_lowers)
-        # bounded_u = copy.deepcopy(self.Model.constrained_bounded_uppers)
-
-        # for b in range(len(bounded_i)): # for each group of constraints
-        #     for bc in range(len(bounded_i[b])):
-        #         pos = np.where(j == bounded_i[b][bc])[0]
-        #         if len(pos) == 1:
-        #             pos2 = np.where(self.Model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
-        #             self.Model.constrained_bounded_indices[b][pos2] = pos[0]
-        #         else:
-        #             if len(self.Model.constrained_bounded_indices[b]) == 1:
-        #                 # if it's the last index to be removed
-        #                 # the logic here is just a mess. If we remove the last one, then all the
-        #                 # b-indices change and we have to iterate through everything to find our
-        #                 # current index. Can't deal with this right now.
-        #                 raise NotImplementedError
-
-        #             else: # just remove it from the indices
-        #                 mask = self.Model.constrained_bounded_indices[b] != bc
-        #                 self.Model.constrained_bounded_indices[b] = self.Model.constrained_bounded_indices[b][mask]
-
-
-        # # here we shif the positive constraints. We cycle through each positive
-        # # constraint
-        # positive = self.Model.constrained_positive_indices.copy()
-        # mask = (np.ones_like(positive) == 1)
-        # for p in range(len(positive)):
-        #     # we now check whether the constrained index appears in the j vector
-        #     # (the vector of the "active" indices)
-        #     pos = np.where(j == self.Model.constrained_positive_indices[p])[0]
-        #     if len(pos) == 1:
-        #         self.Model.constrained_positive_indices[p] = pos
-        #     else:
-        #         mask[p] = False
-        # self.Model.constrained_positive_indices = self.Model.constrained_positive_indices[mask]
-
-        # return (bounded_i, bounded_l, bounded_u), positive
-
-    def restore_constraints(self, c):#b, p):
-        # self.Model.constrained_bounded_indices = b[0]
-        # self.Model.constrained_bounded_lowers = b[1]
-        # self.Model.constrained_bounded_uppers = b[2]
-        # self.Model.constrained_positive_indices = p
-        self.Model.constrained_indices = c
-
-    def get_param_shapes(self, N = None, input_dim = None):
-        model_name = self.Model.__class__.__name__
-        if model_name == 'GPLVM':
-            return [(N, input_dim)]
-        if model_name == 'Bayesian_GPLVM':
-            return [(N, input_dim), (N, input_dim)]
-        else:
-            raise NotImplementedError
-
-    def step_with_missing_data(self, f_fp, X, step, shapes):
-        N, input_dim = X.shape
-
-        if not sp.sparse.issparse(self.Model.likelihood.Y):
-            Y = self.Model.likelihood.Y
-            samples = self.non_null_samples(self.Model.likelihood.Y)
-            self.Model.N = samples.sum()
-            Y = Y[samples]
-        else:
-            samples = self.Model.likelihood.Y.nonzero()[0]
-            self.Model.N = len(samples)
-            Y = np.asarray(self.Model.likelihood.Y[samples].todense(), dtype = np.float64)
-
-        if self.Model.N == 0 or Y.std() == 0.0:
-            return 0, step, self.Model.N
-
-        self.Model.likelihood._offset = Y.mean()
-        self.Model.likelihood._scale = Y.std()
-        self.Model.likelihood.set_data(Y)
-        # self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
-
-        sigma = self.Model.likelihood._variance
-        self.Model.likelihood._variance = None # invalidate cache
-        self.Model.likelihood._set_params(sigma)
-
-
-        j = self.subset_parameter_vector(self.x_opt, samples, shapes)
-        self.Model.X = X[samples]
-
-        model_name = self.Model.__class__.__name__
-
-        if model_name == 'Bayesian_GPLVM':
-            self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
-            self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
-
-        ci = self.shift_constraints(j)
-        f, fp = f_fp(self.x_opt[j])
-
-        step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
-        self.x_opt[j] -= step[j]
-        self.restore_constraints(ci)
-
-        self.Model.grads[j] = fp
-        # restore likelihood _offset and _scale, otherwise when we call set_data(y) on
-        # the next feature, it will get normalized with the mean and std of this one.
-        self.Model.likelihood._offset = 0
-        self.Model.likelihood._scale = 1
-
-        return f, step, self.Model.N
-
-    def adapt_learning_rate(self, t, D):
-        if self.learning_rate_adaptation == 'adagrad':
-            if t > 0:
-                g_k = self.Model.grads
-                self.s_k += np.square(g_k)
-                t0 = 100.0
-                self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k))
-
-                import pdb; pdb.set_trace()
-            else:
-                self.learning_rate = np.zeros_like(self.learning_rate)
-                self.s_k = np.zeros_like(self.x_opt)
-
-        elif self.learning_rate_adaptation == 'annealing':
-            #self.learning_rate = self.learning_rate_0/(1+float(t+1)/10)
-            self.learning_rate = np.ones_like(self.learning_rate) * self.schedule[t]
-
-
-        elif self.learning_rate_adaptation == 'semi_pesky':
-            if self.Model.__class__.__name__ == 'Bayesian_GPLVM':
-                g_t = self.Model.grads
-                if t == 0:
-                    self.hbar_t = 0.0
-                    self.tau_t = 100.0
-                    self.gbar_t = 0.0
-
-                self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
-                self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
-                self.learning_rate = np.ones_like(self.learning_rate)*(np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t)
-                tau_t = self.tau_t*(1-self.learning_rate) + 1
-
-
-    def opt(self, f_fp=None, f=None, fp=None):
-        self.x_opt = self.Model._get_params_transformed()
-        self.grads = []
-
-        X, Y = self.Model.X.copy(), self.Model.likelihood.Y.copy()
-
-        self.Model.likelihood.YYT = 0
-        self.Model.likelihood.trYYT = 0
-        self.Model.likelihood._offset = 0.0
-        self.Model.likelihood._scale = 1.0
-
-        N, input_dim = self.Model.X.shape
-        D = self.Model.likelihood.Y.shape[1]
-        num_params = self.Model._get_params()
-        self.trace = []
-        missing_data = self.check_for_missing(self.Model.likelihood.Y)
-
-        step = np.zeros_like(num_params)
-        for it in range(self.iterations):
-            if self.actual_iter != None:
-                it = self.actual_iter
-
-            self.Model.grads = np.zeros_like(self.x_opt) # TODO this is ugly
-
-            if it == 0 or self.self_paced is False:
-                features = np.random.permutation(Y.shape[1])
-            else:
-                features = np.argsort(NLL)
-
-            b = len(features)/self.batch_size
-            features = [features[i::b] for i in range(b)]
-            NLL = []
-            for count, j in enumerate(features):
-                self.Model.input_dim = len(j)
-                self.Model.likelihood.input_dim = len(j)
-                self.Model.likelihood.set_data(Y[:, j])
-                # self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
-
-                sigma = self.Model.likelihood._variance
-                self.Model.likelihood._variance = None # invalidate cache
-                self.Model.likelihood._set_params(sigma)
-
-                if missing_data:
-                    shapes = self.get_param_shapes(N, input_dim)
-                    f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
-                else:
-                    self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
-                    self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
-                    Nj = N
-                    f, fp = f_fp(self.x_opt)
-                    self.Model.grads = fp.copy()
-                    step = self.momentum * step + self.learning_rate * fp
-                    self.x_opt -= step
-
-                if self.messages == 2:
-                    noise = self.Model.likelihood._variance
-                    status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
-                    sys.stdout.write(status)
-                    sys.stdout.flush()
-                    self.param_traces['noise'].append(noise)
-
-                self.adapt_learning_rate(it+count, D)
-                NLL.append(f)
-                self.fopt_trace.append(NLL[-1])
-
-                # for k in self.param_traces.keys():
-                #     self.param_traces[k].append(self.Model.get(k)[0])
-            self.grads.append(self.Model.grads.tolist())
-            # should really be a sum(), but earlier samples in the iteration will have a very crappy ll
-            self.f_opt = np.mean(NLL)
-            self.Model.N = N
-            self.Model.X = X
-            self.Model.input_dim = D
-            self.Model.likelihood.N = N
-            self.Model.likelihood.input_dim = D
-            self.Model.likelihood.Y = Y
-            sigma = self.Model.likelihood._variance
-            self.Model.likelihood._variance = None # invalidate cache
-            self.Model.likelihood._set_params(sigma)
-
-            self.trace.append(self.f_opt)
-            if self.iteration_file is not None:
-                f = open(self.iteration_file + "iteration%d.pickle" % it, 'w')
-                data = [self.x_opt, self.fopt_trace, self.param_traces]
-                pickle.dump(data, f)
-                f.close()
-
-            if self.messages != 0:
-                sys.stdout.write('\r' + ' '*len(status)*2 + '  \r')
-                status = "SGD Iteration: {0: 3d}/{1: 3d}  f: {2: 2.3f}   max eta: {3: 1.5f}\n".format(it+1, self.iterations, self.f_opt, self.learning_rate.max())
-                sys.stdout.write(status)
-                sys.stdout.flush()
diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py
index f19c3c2e..dc71d539 100644
--- a/GPy/inference/optimization/stochastics.py
+++ b/GPy/inference/optimization/stochastics.py
@@ -1,8 +1,5 @@
-'''
-Created on 9 Oct 2014
-
-@author: maxz
-'''
+# Copyright (c) 2012-2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 class StochasticStorage(object):
     '''
@@ -56,4 +53,4 @@ class SparseGPStochastics(StochasticStorage):
 
     def reset(self):
         self.current_dim = -1
-        self.d = None
\ No newline at end of file
+        self.d = None
diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index c400277c..718be74f 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -1,7 +1,7 @@
 from _src.kern import Kern
 from _src.rbf import RBF
 from _src.linear import Linear, LinearFull
-from _src.static import Bias, White
+from _src.static import Bias, White, Fixed
 from _src.brownian import Brownian
 from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
 from _src.mlp import MLP
@@ -13,6 +13,7 @@ from _src.ODE_UYC import ODE_UYC
 from _src.ODE_st import ODE_st
 from _src.ODE_t import ODE_t
 from _src.poly import Poly
+from _src.eq_ode2 import EQ_ODE2
 
 from _src.trunclinear import TruncLinear,TruncLinear_inf
 from _src.splitKern import SplitKern,DiffGenomeKern
diff --git a/GPy/kern/_src/coregionalize.py b/GPy/kern/_src/coregionalize.py
index 3fcf1c98..291402ec 100644
--- a/GPy/kern/_src/coregionalize.py
+++ b/GPy/kern/_src/coregionalize.py
@@ -127,7 +127,7 @@ class Coregionalize(Kern):
                 config.set('weave', 'working', 'False')
                 dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
         else:
-            dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
+            dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
 
 
 
@@ -154,9 +154,9 @@ class Coregionalize(Kern):
     def _gradient_reduce_numpy(self, dL_dK, index, index2):
         index, index2 = index[:,0], index2[:,0]
         dL_dK_small = np.zeros_like(self.B)
-        for i in range(k.output_dim):
+        for i in range(self.output_dim):
             tmp1 = dL_dK[index==i]
-            for j in range(k.output_dim):
+            for j in range(self.output_dim):
                 dL_dK_small[j,i] = tmp1[:,index2==j].sum()
         return dL_dK_small
 
diff --git a/GPy/kern/_src/eq_ode2.py b/GPy/kern/_src/eq_ode2.py
new file mode 100644
index 00000000..59f67b8b
--- /dev/null
+++ b/GPy/kern/_src/eq_ode2.py
@@ -0,0 +1,1331 @@
+# Copyright (c) 2014, Cristian Guarnizo.
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from scipy.special import wofz
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+from ...util.caching import Cache_this
+
+class EQ_ODE2(Kern):
+    """
+    Covariance function for second order differential equation driven by an exponentiated quadratic covariance.
+
+    This outputs of this kernel have the form
+    .. math::
+       \frac{\text{d}^2y_j(t)}{\text{d}^2t} + C_j\frac{\text{d}y_j(t)}{\text{d}t} + B_jy_j(t) = \sum_{i=1}^R w_{j,i} u_i(t)
+
+    where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
+
+    :param output_dim: number of outputs driven by latent function.
+    :type output_dim: int
+    :param W: sensitivities of each output to the latent driving function.
+    :type W: ndarray (output_dim x rank).
+    :param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
+    :type rank: int
+    :param C: damper constant for the second order system.
+    :type C: array of length output_dim.
+    :param B: spring constant for the second order system.
+    :type B: array of length output_dim.
+
+    """
+    #This code will only work for the sparseGP model, due to limitations in models for this kernel
+    def __init__(self, input_dim=2, output_dim=1, rank=1, W=None, lengthscale=None, C=None, B=None, active_dims=None, name='eq_ode2'):
+        #input_dim should be 1, but kern._slice_X is not returning index information required to evaluate kernels        
+        assert input_dim == 2, "only defined for 1 input dims"
+        super(EQ_ODE2, self).__init__(input_dim=input_dim, active_dims=active_dims, name=name)
+        self.rank = rank
+        self.output_dim = output_dim
+
+        if lengthscale is None:
+            lengthscale = .5+np.random.rand(self.rank)
+        else:
+            lengthscale = np.asarray(lengthscale)
+            assert lengthscale.size in [1, self.rank], "Bad number of lengthscales"
+            if lengthscale.size != self.rank:
+                lengthscale = np.ones(self.input_dim)*lengthscale
+
+        if W is None:
+            #W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
+            W = np.ones((self.output_dim, self.rank))
+        else:
+            assert W.shape == (self.output_dim, self.rank)
+
+        if C is None:
+            C = np.ones(self.output_dim)
+
+        if B is None:
+            B = np.ones(self.output_dim)
+
+        self.C = Param('C', C, Logexp())
+        self.B = Param('B', B, Logexp())
+        self.lengthscale = Param('lengthscale', lengthscale, Logexp())
+        self.W = Param('W', W)
+        self.link_parameters(self.lengthscale, self.C, self.B, self.W)
+
+    @Cache_this(limit=2)
+    def K(self, X, X2=None):
+        #This way is not working, indexes are lost after using k._slice_X
+        #index = np.asarray(X, dtype=np.int)
+        #index = index.reshape(index.size,)
+        if hasattr(X, 'values'):
+            X = X.values
+        index = np.int_(X[:, 1])
+        index = index.reshape(index.size,)
+        X_flag = index[0] >= self.output_dim
+        if X2 is None:
+            if X_flag:
+                #Calculate covariance function for the latent functions
+                index -= self.output_dim
+                return self._Kuu(X, index)
+            else:
+                raise NotImplementedError
+        else:
+            #This way is not working, indexes are lost after using k._slice_X
+            #index2 = np.asarray(X2, dtype=np.int)
+            #index2 = index2.reshape(index2.size,)
+            if hasattr(X2, 'values'):
+                X2 = X2.values
+            index2 = np.int_(X2[:, 1])
+            index2 = index2.reshape(index2.size,)
+            X2_flag = index2[0] >= self.output_dim
+            #Calculate cross-covariance function
+            if not X_flag and X2_flag:
+                index2 -= self.output_dim
+                return self._Kfu(X, index, X2, index2) #Kfu
+            else:
+                index -= self.output_dim
+                return self._Kfu(X2, index2, X, index).T #Kuf
+
+    #Calculate the covariance function for diag(Kff(X,X))
+    def Kdiag(self, X):
+        #This way is not working, indexes are lost after using k._slice_X
+        #index = np.asarray(X, dtype=np.int)
+        #index = index.reshape(index.size,)
+        if hasattr(X, 'values'):
+            X = X.values
+        index = np.int_(X[:, 1])
+        index = index.reshape(index.size,)
+        
+        #terms that move along t
+        t = X[:, 0].reshape(X.shape[0], 1)
+        d = np.unique(index) #Output Indexes
+        B = self.B.values[d]
+        C = self.C.values[d]
+        S = self.W.values[d, :]
+        #Index transformation
+        indd = np.arange(self.output_dim)
+        indd[d] = np.arange(d.size)
+        index = indd[index]
+        #Check where wd becomes complex
+        wbool = C*C >= 4.*B
+        B = B.reshape(B.size, 1)
+        C = C.reshape(C.size, 1)
+        alpha = .5*C
+        C2 = C*C
+
+        wbool2 = wbool[index]
+        ind2t = np.where(wbool2)
+        ind3t = np.where(np.logical_not(wbool2))
+
+        #Terms that move along q
+        lq = self.lengthscale.values.reshape(1, self.lengthscale.size)
+        S2 = S*S
+        kdiag = np.empty((t.size, ))
+
+        indD = np.arange(B.size)
+        #(1) When wd is real
+        if np.any(np.logical_not(wbool)):
+            #Indexes of index and t related to (2)
+            t1 = t[ind3t]
+            ind = index[ind3t]
+            d = np.asarray(np.where(np.logical_not(wbool))[0]) #Selection of outputs
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            S2lq = S2[d]*(.5*lq)
+            c0 = S2lq*np.sqrt(np.pi)
+            w = .5*np.sqrt(4.*B[d] - C2[d])
+            alphad = alpha[d]
+            w2 = w*w
+            gam = alphad + 1j*w
+            gamc = alphad - 1j*w
+            c1 = .5/(alphad*w2)
+            c2 = .5/(gam*w2)
+            c = c1 - c2
+            #DxQ terms
+            nu = lq*(gam*.5)
+            K01 = c0*c
+            #Nx1 terms
+            gamt = -gam[ind]*t1
+            gamct = -gamc[ind]*t1
+            egamt = np.exp(gamt)
+            ec = egamt*c2[ind] - np.exp(gamct)*c1[ind]
+            #NxQ terms
+            t_lq = t1/lq
+
+            # Upsilon Calculations
+            # Using wofz
+            wnu = wofz(1j*nu)
+            lwnu = np.log(wnu)
+            t2_lq2 = -t_lq*t_lq
+            upm = wnu[ind] - np.exp(t2_lq2 + gamt + np.log(wofz(1j*(t_lq + nu[ind]))))
+            upm[t1[:, 0] == 0, :] = 0.
+
+            nu2 = nu*nu
+            z1 = nu[ind] - t_lq
+            indv1 = np.where(z1.real >= 0.)
+            indv2 = np.where(z1.real < 0.)
+            upv = -np.exp(lwnu[ind] + gamt)
+            if indv1[0].shape > 0:
+                upv[indv1] += np.exp(t2_lq2[indv1] + np.log(wofz(1j*z1[indv1])))
+            if indv2[0].shape > 0:
+                upv[indv2] += np.exp(nu2[ind[indv2[0]], indv2[1]] + gamt[indv2[0], 0] + np.log(2.))\
+                             - np.exp(t2_lq2[indv2] + np.log(wofz(-1j*z1[indv2])))
+            upv[t1[:, 0] == 0, :] = 0.
+
+            #Covariance calculation
+            kdiag[ind3t] = np.sum(np.real(K01[ind]*upm), axis=1)
+            kdiag[ind3t] += np.sum(np.real((c0[ind]*ec)*upv), axis=1)
+
+        #(2) When w_d is complex
+        if np.any(wbool):
+            t1 = t[ind2t]
+            ind = index[ind2t]
+            #Index transformation
+            d = np.asarray(np.where(wbool)[0])
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            S2lq = S2[d]*(lq*.25)
+            c0 = S2lq*np.sqrt(np.pi)
+            w = .5*np.sqrt(C2[d] - 4.*B[d])
+            alphad = alpha[d]
+            gam = alphad - w
+            gamc = alphad + w
+            w2 = -w*w
+            c1 = .5/(alphad*w2)
+            c21 = .5/(gam*w2)
+            c22 = .5/(gamc*w2)
+            c = c1 - c21
+            c2 = c1 - c22
+            #DxQ terms
+            K011 = c0*c
+            K012 = c0*c2
+            nu = lq*(.5*gam)
+            nuc = lq*(.5*gamc)
+            #Nx1 terms
+            gamt = -gam[ind]*t1
+            gamct = -gamc[ind]*t1
+            egamt = np.exp(gamt)
+            egamct = np.exp(gamct)
+            ec = egamt*c21[ind] - egamct*c1[ind]
+            ec2 = egamct*c22[ind] - egamt*c1[ind]
+            #NxQ terms
+            t_lq = t1/lq
+
+            #Upsilon Calculations using wofz
+            t2_lq2 = -t_lq*t_lq #Required when using wofz
+            wnu = wofz(1j*nu).real
+            lwnu = np.log(wnu)
+            upm = wnu[ind] - np.exp(t2_lq2 + gamt + np.log(wofz(1j*(t_lq + nu[ind])).real))
+            upm[t1[:, 0] == 0., :] = 0.
+
+            nu2 = nu*nu
+            z1 = nu[ind] - t_lq
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            upv = -np.exp(lwnu[ind] + gamt)
+            if indv1[0].shape > 0:
+                upv[indv1] += np.exp(t2_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                upv[indv2] += np.exp(nu2[ind[indv2[0]], indv2[1]] + gamt[indv2[0], 0] + np.log(2.))\
+                              - np.exp(t2_lq2[indv2] + np.log(wofz(-1j*z1[indv2]).real))
+            upv[t1[:, 0] == 0, :] = 0.
+
+            wnuc = wofz(1j*nuc).real
+            lwnuc = np.log(wnuc)
+
+            upmc = wnuc[ind] - np.exp(t2_lq2 + gamct + np.log(wofz(1j*(t_lq + nuc[ind])).real))
+            upmc[t1[:, 0] == 0., :] = 0.
+
+            nuc2 = nuc*nuc
+            z1 = nuc[ind] - t_lq
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            upvc = - np.exp(lwnuc[ind] + gamct)
+            if indv1[0].shape > 0:
+                upvc[indv1] += np.exp(t2_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                upvc[indv2] += np.exp(nuc2[ind[indv2[0]], indv2[1]] + gamct[indv2[0], 0] + np.log(2.))\
+                               - np.exp(t2_lq2[indv2] + np.log(wofz(-1j*z1[indv2]).real))
+            upvc[t1[:, 0] == 0, :] = 0.
+
+            #Covariance calculation
+            kdiag[ind2t] = np.sum(K011[ind]*upm + K012[ind]*upmc + (c0[ind]*ec)*upv + (c0[ind]*ec2)*upvc, axis=1)
+        return kdiag
+
+    def update_gradients_full(self, dL_dK, X, X2 = None):
+        #index = np.asarray(X, dtype=np.int)
+        #index = index.reshape(index.size,)
+        if hasattr(X, 'values'):
+            X = X.values
+        self.B.gradient = np.zeros(self.B.shape)
+        self.C.gradient = np.zeros(self.C.shape)
+        self.W.gradient = np.zeros(self.W.shape)
+        self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
+        index = np.int_(X[:, 1])
+        index = index.reshape(index.size,)
+        X_flag = index[0] >= self.output_dim
+        if X2 is None:
+            if X_flag: #Kuu or Kmm
+                index -= self.output_dim
+                tmp = dL_dK*self._gkuu_lq(X, index)
+                for q in np.unique(index):
+                    ind = np.where(index == q)
+                    self.lengthscale.gradient[q] = tmp[np.ix_(ind[0], ind[0])].sum()
+            else:
+                raise NotImplementedError
+        else: #Kfu or Knm
+            #index2 = np.asarray(X2, dtype=np.int)
+            #index2 = index2.reshape(index2.size,)
+            if hasattr(X2, 'values'):
+                X2 = X2.values
+            index2 = np.int_(X2[:, 1])
+            index2 = index2.reshape(index2.size,)
+            X2_flag = index2[0] >= self.output_dim
+            if not X_flag and X2_flag:
+                index2 -= self.output_dim
+            else:
+                dL_dK = dL_dK.T #so we obtaing dL_Kfu
+                indtemp = index - self.output_dim
+                Xtemp = X
+                X = X2
+                X2 = Xtemp
+                index = index2
+                index2 = indtemp
+            glq, gSdq, gB, gC = self._gkfu(X, index, X2, index2)
+            tmp = dL_dK*glq
+            for q in np.unique(index2):
+                ind = np.where(index2 == q)
+                self.lengthscale.gradient[q] = tmp[:, ind].sum()
+            tmpB = dL_dK*gB
+            tmpC = dL_dK*gC
+            tmp = dL_dK*gSdq
+            for d in np.unique(index):
+                ind = np.where(index == d)
+                self.B.gradient[d] = tmpB[ind, :].sum()
+                self.C.gradient[d] = tmpC[ind, :].sum()
+                for q in np.unique(index2):
+                    ind2 = np.where(index2 == q)
+                    self.W.gradient[d, q] = tmp[np.ix_(ind[0], ind2[0])].sum()
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        #index = np.asarray(X, dtype=np.int)
+        #index = index.reshape(index.size,)
+        if hasattr(X, 'values'):
+            X = X.values
+        self.B.gradient = np.zeros(self.B.shape)
+        self.C.gradient = np.zeros(self.C.shape)
+        self.W.gradient = np.zeros(self.W.shape)
+        self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
+        index = np.int_(X[:, 1])
+        index = index.reshape(index.size,)
+        
+        glq, gS, gB, gC = self._gkdiag(X, index)
+        tmp = dL_dKdiag.reshape(index.size, 1)*glq
+        self.lengthscale.gradient = tmp.sum(0)
+        #TODO: Avoid the reshape by a priori knowing the shape of dL_dKdiag
+        tmpB = dL_dKdiag*gB.reshape(dL_dKdiag.shape)
+        tmpC = dL_dKdiag*gC.reshape(dL_dKdiag.shape)
+        tmp = dL_dKdiag.reshape(index.size, 1)*gS
+        for d in np.unique(index):
+            ind = np.where(index == d)
+            self.B.gradient[d] = tmpB[ind].sum()
+            self.C.gradient[d] = tmpC[ind].sum()
+            self.W.gradient[d, :] = tmp[ind].sum(0)
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        #index = np.asarray(X, dtype=np.int)
+        #index = index.reshape(index.size,)
+        if hasattr(X, 'values'):
+            X = X.values
+        index = np.int_(X[:, 1])
+        index = index.reshape(index.size,)
+        X_flag = index[0] >= self.output_dim
+        #If input_dim == 1, use this
+        #gX = np.zeros((X.shape[0], 1))
+        #Cheat to allow gradient for input_dim==2
+        gX = np.zeros(X.shape)
+        if X2 is None: #Kuu or Kmm
+            if X_flag:
+                index -= self.output_dim
+                gX[:, 0] = 2.*(dL_dK*self._gkuu_X(X, index)).sum(0)
+                return gX
+            else:
+                raise NotImplementedError
+        else: #Kuf or Kmn
+            #index2 = np.asarray(X2, dtype=np.int)
+            #index2 = index2.reshape(index2.size,)
+            if hasattr(X2, 'values'):
+                X2 = X2.values
+            index2 = np.int_(X2[:, 1])
+            index2 = index2.reshape(index2.size,)
+            X2_flag = index2[0] >= self.output_dim
+            if X_flag and not X2_flag: #gradient of Kuf(Z, X) wrt Z
+                index -= self.output_dim
+                gX[:, 0] = (dL_dK*self._gkfu_z(X2, index2, X, index).T).sum(1)
+                return gX
+            else:
+                raise NotImplementedError
+
+    #---------------------------------------#
+    #             Helper functions          #
+    #---------------------------------------#
+
+    #Evaluation of squared exponential for LFM
+    def _Kuu(self, X, index):
+        index = index.reshape(index.size,)
+        t = X[:, 0].reshape(X.shape[0],)
+        lq = self.lengthscale.values.reshape(self.rank,)
+        lq2 = lq*lq
+        #Covariance matrix initialization
+        kuu = np.zeros((t.size, t.size))
+        #Assign 1. to diagonal terms
+        kuu[np.diag_indices(t.size)] = 1.
+        #Upper triangular indices
+        indtri1, indtri2 = np.triu_indices(t.size, 1)
+        #Block Diagonal indices among Upper Triangular indices
+        ind = np.where(index[indtri1] == index[indtri2])
+        indr = indtri1[ind]
+        indc = indtri2[ind]
+        r = t[indr] - t[indc]
+        r2 = r*r
+        #Calculation of  covariance function
+        kuu[indr, indc] = np.exp(-r2/lq2[index[indr]])
+        #Completation of lower triangular part
+        kuu[indc, indr] = kuu[indr, indc]
+        return kuu
+
+    #Evaluation of cross-covariance function
+    def _Kfu(self, X, index, X2, index2):
+        #terms that move along t
+        t = X[:, 0].reshape(X.shape[0], 1)
+        d = np.unique(index) #Output Indexes
+        B = self.B.values[d]
+        C = self.C.values[d]
+        S = self.W.values[d, :]
+        #Index transformation
+        indd = np.arange(self.output_dim)
+        indd[d] = np.arange(d.size)
+        index = indd[index]
+        #Check where wd becomes complex
+        wbool = C*C >= 4.*B
+        #Output related variables must be column-wise
+        C = C.reshape(C.size, 1)
+        B = B.reshape(B.size, 1)
+        C2 = C*C
+        #Input related variables must be row-wise
+        z = X2[:, 0].reshape(1, X2.shape[0])
+        lq = self.lengthscale.values.reshape((1, self.rank))
+        #print np.max(z), np.max(z/lq[0, index2])
+        alpha = .5*C
+
+        wbool2 = wbool[index]
+        ind2t = np.where(wbool2)
+        ind3t = np.where(np.logical_not(wbool2))
+
+        kfu = np.empty((t.size, z.size))
+
+        indD = np.arange(B.size)
+        #(1) when wd is real
+        if np.any(np.logical_not(wbool)):
+            #Indexes of index and t related to (2)
+            t1 = t[ind3t]
+            ind = index[ind3t]
+            #Index transformation
+            d = np.asarray(np.where(np.logical_not(wbool))[0])
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            w = .5*np.sqrt(4.*B[d] - C2[d])
+            alphad = alpha[d]
+            gam = alphad - 1j*w
+
+            #DxQ terms
+            Slq = (S[d]/w)*(.5*lq)
+            c0 = Slq*np.sqrt(np.pi)
+            nu = gam*(.5*lq)
+            #1xM terms
+            z_lq = z/lq[0, index2]
+            #NxQ terms
+            t_lq = t1/lq
+            #NxM terms
+            zt_lq = z_lq - t_lq[:, index2]
+
+            # Upsilon Calculations
+            #Using wofz
+            tz = t1-z
+            fullind = np.ix_(ind, index2)
+            zt_lq2 = -zt_lq*zt_lq
+            z_lq2 = -z_lq*z_lq
+            gamt = -gam[ind]*t1
+
+            upsi = - np.exp(z_lq2 + gamt + np.log(wofz(1j*(z_lq + nu[fullind]))))
+            z1 = zt_lq + nu[fullind]
+            indv1 = np.where(z1.real >= 0.)
+            indv2 = np.where(z1.real < 0.)
+            if indv1[0].shape > 0:
+                upsi[indv1] += np.exp(zt_lq2[indv1] + np.log(wofz(1j*z1[indv1])))
+            if indv2[0].shape > 0:
+                nua2 = nu[ind[indv2[0]], index2[indv2[1]]]**2
+                upsi[indv2] += np.exp(nua2 - gam[ind[indv2[0]], 0]*tz[indv2] + np.log(2.))\
+                               - np.exp(zt_lq2[indv2] + np.log(wofz(-1j*z1[indv2])))
+            upsi[t1[:, 0] == 0., :] = 0.
+
+            #Covariance calculation
+            kfu[ind3t] = c0[fullind]*upsi.imag
+
+        #(2) when wd is complex
+        if np.any(wbool):
+            #Indexes of index and t related to (2)
+            t1 = t[ind2t]
+            ind = index[ind2t]
+            #Index transformation
+            d = np.asarray(np.where(wbool)[0])
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            w = .5*np.sqrt(C2[d] - 4.*B[d])
+            alphad = alpha[d]
+            gam = alphad - w
+            gamc = alphad + w
+            #DxQ terms
+            Slq = S[d]*(lq*.25)
+            c0 = -Slq*(np.sqrt(np.pi)/w)
+            nu = gam*(lq*.5)
+            nuc = gamc*(lq*.5)
+            #1xM terms
+            z_lq = z/lq[0, index2]
+            #NxQ terms
+            t_lq = t1/lq[0, index2]
+            #NxM terms
+            zt_lq = z_lq - t_lq
+
+            # Upsilon Calculations
+            tz = t1-z
+            z_lq2 = -z_lq*z_lq
+            zt_lq2 = -zt_lq*zt_lq
+            gamt = -gam[ind]*t1
+            gamct = -gamc[ind]*t1
+            fullind = np.ix_(ind, index2)
+            upsi = np.exp(z_lq2 + gamt + np.log(wofz(1j*(z_lq + nu[fullind])).real))\
+                   - np.exp(z_lq2 + gamct + np.log(wofz(1j*(z_lq + nuc[fullind])).real))
+
+            z1 = zt_lq + nu[fullind]
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            if indv1[0].shape > 0:
+                upsi[indv1] -= np.exp(zt_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                nua2 = nu[ind[indv2[0]], index2[indv2[1]]]**2
+                upsi[indv2] -= np.exp(nua2 - gam[ind[indv2[0]], 0]*tz[indv2] + np.log(2.))\
+                               - np.exp(zt_lq2[indv2] + np.log(wofz(-1j*z1[indv2]).real))
+            z1 = zt_lq + nuc[fullind]
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            if indv1[0].shape > 0:
+                upsi[indv1] += np.exp(zt_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                nuac2 = nuc[ind[indv2[0]], index2[indv2[1]]]**2
+                upsi[indv2] += np.exp(nuac2 - gamc[ind[indv2[0]], 0]*tz[indv2] + np.log(2.))\
+                               - np.exp(zt_lq2[indv2] + np.log(wofz(-1j*z1[indv2]).real))
+            upsi[t1[:, 0] == 0., :] = 0.
+
+            kfu[ind2t] = c0[np.ix_(ind, index2)]*upsi
+        return kfu
+
+    #Gradient of Kuu wrt lengthscale
+    def _gkuu_lq(self, X, index):
+        t = X[:, 0].reshape(X.shape[0],)
+        index = index.reshape(X.shape[0],)
+        lq = self.lengthscale.values.reshape(self.rank,)
+        lq2 = lq*lq
+        #Covariance matrix initialization
+        glq = np.zeros((t.size, t.size))
+        #Upper triangular indices
+        indtri1, indtri2 = np.triu_indices(t.size, 1)
+        #Block Diagonal indices among Upper Triangular indices
+        ind = np.where(index[indtri1] == index[indtri2])
+        indr = indtri1[ind]
+        indc = indtri2[ind]
+        r = t[indr] - t[indc]
+        r2 = r*r
+        r2_lq2 = r2/lq2[index[indr]]
+        #Calculation of  covariance function
+        er2_lq2 = np.exp(-r2_lq2)
+        #Gradient wrt lq
+        c = 2.*r2_lq2/lq[index[indr]]
+        glq[indr, indc] = er2_lq2*c
+        #Complete the lower triangular
+        glq[indc, indr] = glq[indr, indc]
+        return glq
+
+    #Be careful this derivative should be transpose it
+    def _gkuu_X(self, X, index): #Diagonal terms are always zero
+        t = X[:, 0].reshape(X.shape[0],)
+        index = index.reshape(index.size,)
+        lq = self.lengthscale.values.reshape(self.rank,)
+        lq2 = lq*lq
+        #Covariance matrix initialization
+        gt = np.zeros((t.size, t.size))
+        #Upper triangular indices
+        indtri1, indtri2 = np.triu_indices(t.size, 1) #Offset of 1 from the diagonal
+        #Block Diagonal indices among Upper Triangular indices
+        ind = np.where(index[indtri1] == index[indtri2])
+        indr = indtri1[ind]
+        indc = indtri2[ind]
+        r = t[indr] - t[indc]
+        r2 = r*r
+        r2_lq2 = r2/(-lq2[index[indr]])
+        #Calculation of  covariance function
+        er2_lq2 = np.exp(r2_lq2)
+        #Gradient wrt t
+        c = 2.*r/lq2[index[indr]]
+        gt[indr, indc] = er2_lq2*c
+        #Complete the lower triangular
+        gt[indc, indr] = -gt[indr, indc]
+        return gt
+
+    #Gradients for Diagonal Kff
+    def _gkdiag(self, X, index):
+        index = index.reshape(index.size,)
+        #terms that move along t
+        d = np.unique(index)
+        B = self.B[d].values
+        C = self.C[d].values
+        S = self.W[d, :].values
+        #Index transformation
+        indd = np.arange(self.output_dim)
+        indd[d] = np.arange(d.size)
+        index = indd[index]
+        #Check where wd becomes complex
+        wbool = C*C >= 4.*B
+        #Output related variables must be column-wise
+        t = X[:, 0].reshape(X.shape[0], 1)
+        B = B.reshape(B.size, 1)
+        C = C.reshape(C.size, 1)
+        alpha = .5*C
+        C2 = C*C
+        S2 = S*S
+
+        wbool2 = wbool[index]
+        ind2t = np.where(wbool2)
+        ind3t = np.where(np.logical_not(wbool2))
+
+        #Input related variables must be row-wise
+        lq = self.lengthscale.values.reshape(1, self.rank)
+        lq2 = lq*lq
+
+        gB = np.empty((t.size,))
+        gC = np.empty((t.size,))
+        glq = np.empty((t.size, lq.size))
+        gS = np.empty((t.size, lq.size))
+
+        indD = np.arange(B.size)
+        #(1) When wd is real
+        if np.any(np.logical_not(wbool)):
+            #Indexes of index and t related to (1)
+            t1 = t[ind3t]
+            ind = index[ind3t]
+            #Index transformation
+            d = np.asarray(np.where(np.logical_not(wbool))[0])
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            S2lq = S2[d]*(.5*lq)
+            c0 = S2lq*np.sqrt(np.pi)
+
+            w = .5*np.sqrt(4.*B[d] - C2[d])
+            alphad = alpha[d]
+            alpha2 = alphad*alphad
+            w2 = w*w
+            gam = alphad + 1j*w
+            gam2 = gam*gam
+            gamc = alphad - 1j*w
+            c1 = 0.5/alphad
+            c2 = 0.5/gam
+            c = c1 - c2
+
+            #DxQ terms
+            c0 = c0/w2
+            nu = (.5*lq)*gam
+            #Nx1 terms
+            gamt = -gam[ind]*t1
+            gamct = -gamc[ind]*t1
+            egamt = np.exp(gamt)
+            egamct = np.exp(gamct)
+            ec = egamt*c2[ind] - egamct*c1[ind]
+
+            #NxQ terms
+            t_lq = t1/lq
+            t2_lq2 = -t_lq*t_lq
+            t_lq2 = t_lq/lq
+
+            et2_lq2 = np.exp(t2_lq2)
+            etlq2gamt = np.exp(t2_lq2 + gamt)
+
+            ##Upsilon calculations
+            #Using wofz
+            wnu = wofz(1j*nu)
+            lwnu = np.log(wnu)
+            t2_lq2 = -t_lq*t_lq
+            upm = wnu[ind] - np.exp(t2_lq2 + gamt + np.log(wofz(1j*(t_lq + nu[ind]))))
+            upm[t1[:, 0] == 0, :] = 0.
+
+            nu2 = nu*nu
+            z1 = nu[ind] - t_lq
+            indv1 = np.where(z1.real >= 0.)
+            indv2 = np.where(z1.real < 0.)
+            upv = -np.exp(lwnu[ind] + gamt)
+            if indv1[0].shape > 0:
+                upv[indv1] += np.exp(t2_lq2[indv1] + np.log(wofz(1j*z1[indv1])))
+            if indv2[0].shape > 0:
+                upv[indv2] += np.exp(nu2[ind[indv2[0]], indv2[1]] + gamt[indv2[0], 0] + np.log(2.))\
+                             - np.exp(t2_lq2[indv2] + np.log(wofz(-1j*z1[indv2])))
+            upv[t1[:, 0] == 0, :] = 0.
+
+            #Gradient wrt S
+            Slq = S[d]*lq #For grad wrt S
+            c0_S = Slq*np.sqrt(np.pi)/w2
+            K01 = c0_S*c
+
+            gS[ind3t] = np.real(K01[ind]*upm) + np.real((c0_S[ind]*ec)*upv)
+
+            #For B and C
+            upmd = etlq2gamt - 1.
+            upvd = egamt - et2_lq2
+
+            # gradient wrt B
+            dw_dB = 0.5/w
+            dgam_dB = 1j*dw_dB
+
+            Ba1 = c0*(0.5*dgam_dB/gam2 + (0.5*lq2*gam*dgam_dB - 2.*dw_dB/w)*c)
+            Ba2_1 = c0*(dgam_dB*(0.5/gam2 - 0.25*lq2) + dw_dB/(w*gam))
+            Ba2_2 = c0*dgam_dB/gam
+            Ba3 = c0*(-0.25*lq2*gam*dgam_dB/alphad + dw_dB/(w*alphad))
+            Ba4_1 = (S2lq*lq)*dgam_dB/w2
+            Ba4 = Ba4_1*c
+
+            gB[ind3t] = np.sum(np.real(Ba1[ind]*upm) - np.real(((Ba2_1[ind] + Ba2_2[ind]*t1)*egamt - Ba3[ind]*egamct)*upv)\
+                + np.real(Ba4[ind]*upmd) + np.real((Ba4_1[ind]*ec)*upvd), axis=1)
+
+            # gradient wrt C
+            dw_dC = - alphad*dw_dB
+            dgam_dC = 0.5 + 1j*dw_dC
+
+            Ca1 = c0*(-0.25/alpha2 + 0.5*dgam_dC/gam2 + (0.5*lq2*gam*dgam_dC - 2.*dw_dC/w)*c)
+            Ca2_1 = c0*(dgam_dC*(0.5/gam2 - 0.25*lq2) + dw_dC/(w*gam))
+            Ca2_2 = c0*dgam_dC/gam
+            Ca3_1 = c0*(0.25/alpha2 - 0.25*lq2*gam*dgam_dC/alphad + dw_dC/(w*alphad))
+            Ca3_2 = 0.5*c0/alphad
+            Ca4_1 = (S2lq*lq)*dgam_dC/w2
+            Ca4 = Ca4_1*c
+
+            gC[ind3t] = np.sum(np.real(Ca1[ind]*upm) - np.real(((Ca2_1[ind] + Ca2_2[ind]*t1)*egamt - (Ca3_1[ind] + Ca3_2[ind]*t1)*egamct)*upv)\
+                + np.real(Ca4[ind]*upmd) + np.real((Ca4_1[ind]*ec)*upvd), axis=1)
+
+            #Gradient wrt lengthscale
+            #DxQ terms
+            la = (1./lq + nu*gam)*c0
+            la1 = la*c
+
+            c0l = (S2[d]/w2)*lq
+            la3 = c0l*c
+            gam_2 = .5*gam
+            glq[ind3t] = (la1[ind]*upm).real + ((la[ind]*ec)*upv).real\
+                + (la3[ind]*(-gam_2[ind] + etlq2gamt*(-t_lq2 + gam_2[ind]))).real\
+                + ((c0l[ind]*ec)*(-et2_lq2*(t_lq2 + gam_2[ind]) + egamt*gam_2[ind])).real
+
+        #(2) When w_d is complex
+        if np.any(wbool):
+            t1 = t[ind2t]
+            ind = index[ind2t]
+            #Index transformation
+            d = np.asarray(np.where(wbool)[0])
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            S2lq = S2[d]*(.25*lq)
+            c0 = S2lq*np.sqrt(np.pi)
+            w = .5*np.sqrt(C2[d]-4.*B[d])
+            w2 = -w*w
+            alphad = alpha[d]
+            alpha2 = alphad*alphad
+            gam = alphad - w
+            gamc = alphad + w
+            gam2 = gam*gam
+            gamc2 = gamc*gamc
+            c1 = .5/alphad
+            c21 = .5/gam
+            c22 = .5/gamc
+            c = c1 - c21
+            c2 = c1 - c22
+            #DxQ terms
+            c0 = c0/w2
+            nu = .5*lq*gam
+            nuc = .5*lq*gamc
+
+            #Nx1 terms
+            gamt = -gam[ind]*t1
+            gamct = -gamc[ind]*t1
+            egamt = np.exp(gamt)
+            egamct = np.exp(gamct)
+            ec = egamt*c21[ind] - egamct*c1[ind]
+            ec2 = egamct*c22[ind] - egamt*c1[ind]
+            #NxQ terms
+            t_lq = t1/lq
+            t2_lq2 = -t_lq*t_lq
+
+            et2_lq2 = np.exp(t2_lq2)
+            etlq2gamct = np.exp(t2_lq2 + gamct)
+            etlq2gamt = np.exp(t2_lq2 + gamt)
+
+            #Upsilon Calculations using wofz
+            t2_lq2 = -t_lq*t_lq #Required when using wofz
+            wnu = np.real(wofz(1j*nu))
+            lwnu = np.log(wnu)
+
+            upm = wnu[ind] - np.exp(t2_lq2 + gamt + np.log(wofz(1j*(t_lq + nu[ind])).real))
+            upm[t1[:, 0] == 0., :] = 0.
+
+            nu2 = nu*nu
+            z1 = nu[ind] - t_lq
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            upv = -np.exp(lwnu[ind] + gamt)
+            if indv1[0].shape > 0:
+                upv[indv1] += np.exp(t2_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                upv[indv2] += np.exp(nu2[ind[indv2[0]], indv2[1]] + gamt[indv2[0], 0] + np.log(2.)) - np.exp(t2_lq2[indv2]\
+                    + np.log(wofz(-1j*z1[indv2]).real))
+            upv[t1[:, 0] == 0, :] = 0.
+
+            wnuc = wofz(1j*nuc).real
+            upmc = wnuc[ind] - np.exp(t2_lq2 + gamct + np.log(wofz(1j*(t_lq + nuc[ind])).real))
+            upmc[t1[:, 0] == 0., :] = 0.
+
+            lwnuc = np.log(wnuc)
+            nuc2 = nuc*nuc
+            z1 = nuc[ind] - t_lq
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            upvc = -np.exp(lwnuc[ind] + gamct)
+            if indv1[0].shape > 0:
+                upvc[indv1] += np.exp(t2_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                upvc[indv2] += np.exp(nuc2[ind[indv2[0]], indv2[1]] + gamct[indv2[0], 0] + np.log(2.)) - np.exp(t2_lq2[indv2]\
+                    + np.log(wofz(-1j*z1[indv2]).real))
+            upvc[t1[:, 0] == 0, :] = 0.
+
+            #Gradient wrt S
+            #NxQ terms
+            c0_S = (S[d]/w2)*(lq*(np.sqrt(np.pi)*.5))
+
+            K011 = c0_S*c
+            K012 = c0_S*c2
+
+            gS[ind2t] = K011[ind]*upm + K012[ind]*upmc + (c0_S[ind]*ec)*upv + (c0_S[ind]*ec2)*upvc
+
+            #Is required to cache this, C gradient also required them
+            upmd = -1. + etlq2gamt
+            upvd = -et2_lq2 + egamt
+            upmdc = -1. + etlq2gamct
+            upvdc = -et2_lq2 + egamct
+
+            # Gradient wrt B
+            dgam_dB = 0.5/w
+            dgamc_dB = -dgam_dB
+
+            Ba1 = c0*(0.5*dgam_dB/gam2 + (0.5*lq2*gam*dgam_dB - 1./w2)*c)
+            Ba3 = c0*(-0.25*lq2*gam*dgam_dB/alphad + 0.5/(w2*alphad))
+            Ba4_1 = (S2lq*lq)*dgam_dB/w2
+            Ba4 = Ba4_1*c
+            Ba2_1 = c0*(dgam_dB*(0.5/gam2 - 0.25*lq2) + 0.5/(w2*gam))
+            Ba2_2 = c0*dgam_dB/gam
+
+            Ba1c = c0*(0.5*dgamc_dB/gamc2 + (0.5*lq2*gamc*dgamc_dB - 1./w2)*c2)
+            Ba3c = c0*(-0.25*lq2*gamc*dgamc_dB/alphad + 0.5/(w2*alphad))
+            Ba4_1c = (S2lq*lq)*dgamc_dB/w2
+            Ba4c = Ba4_1c*c2
+            Ba2_1c = c0*(dgamc_dB*(0.5/gamc2 - 0.25*lq2) + 0.5/(w2*gamc))
+            Ba2_2c = c0*dgamc_dB/gamc
+
+            gB[ind2t] = np.sum(Ba1[ind]*upm - ((Ba2_1[ind] + Ba2_2[ind]*t1)*egamt - Ba3[ind]*egamct)*upv\
+                + Ba4[ind]*upmd + (Ba4_1[ind]*ec)*upvd\
+                + Ba1c[ind]*upmc - ((Ba2_1c[ind] + Ba2_2c[ind]*t1)*egamct - Ba3c[ind]*egamt)*upvc\
+                + Ba4c[ind]*upmdc + (Ba4_1c[ind]*ec2)*upvdc, axis=1)
+
+            ##Gradient wrt C
+            dw_dC = 0.5*alphad/w
+            dgam_dC = 0.5 - dw_dC
+            dgamc_dC = 0.5 + dw_dC
+            S2lq2 = S2lq*lq
+
+            Ca1 = c0*(-0.25/alpha2 + 0.5*dgam_dC/gam2 + (0.5*lq2*gam*dgam_dC + alphad/w2)*c)
+            Ca2_1 = c0*(dgam_dC*(0.5/gam2 - 0.25*lq2) - 0.5*alphad/(w2*gam))
+            Ca2_2 = c0*dgam_dC/gam
+            Ca3_1 = c0*(0.25/alpha2 - 0.25*lq2*gam*dgam_dC/alphad - 0.5/w2)
+            Ca3_2 = 0.5*c0/alphad
+            Ca4_1 = S2lq2*(dgam_dC/w2)
+            Ca4 = Ca4_1*c
+
+            Ca1c = c0*(-0.25/alpha2 + 0.5*dgamc_dC/gamc2 + (0.5*lq2*gamc*dgamc_dC + alphad/w2)*c2)
+            Ca2_1c = c0*(dgamc_dC*(0.5/gamc2 - 0.25*lq2) - 0.5*alphad/(w2*gamc))
+            Ca2_2c = c0*dgamc_dC/gamc
+            Ca3_1c = c0*(0.25/alpha2 - 0.25*lq2*gamc*dgamc_dC/alphad - 0.5/w2)
+            Ca3_2c = 0.5*c0/alphad
+            Ca4_1c = S2lq2*(dgamc_dC/w2)
+            Ca4c = Ca4_1c*c2
+
+            gC[ind2t] = np.sum(Ca1[ind]*upm - ((Ca2_1[ind] + Ca2_2[ind]*t1)*egamt - (Ca3_1[ind] + Ca3_2[ind]*t1)*egamct)*upv\
+                + Ca4[ind]*upmd + (Ca4_1[ind]*ec)*upvd\
+                + Ca1c[ind]*upmc - ((Ca2_1c[ind] + Ca2_2c[ind]*t1)*egamct - (Ca3_1c[ind] + Ca3_2c[ind]*t1)*egamt)*upvc\
+                + Ca4c[ind]*upmdc + (Ca4_1c[ind]*ec2)*upvdc, axis=1)
+
+            #Gradient wrt lengthscale
+            #DxQ terms
+            la = (1./lq + nu*gam)*c0
+            lac = (1./lq + nuc*gamc)*c0
+            la1 = la*c
+            la1c = lac*c2
+            t_lq2 = t_lq/lq
+            c0l = (S2[d]/w2)*(.5*lq)
+            la3 = c0l*c
+            la3c = c0l*c2
+            gam_2 = .5*gam
+            gamc_2 = .5*gamc
+            glq[ind2t] = la1c[ind]*upmc + (lac[ind]*ec2)*upvc\
+                + la3c[ind]*(-gamc_2[ind] + etlq2gamct*(-t_lq2 + gamc_2[ind]))\
+                + (c0l[ind]*ec2)*(-et2_lq2*(t_lq2 + gamc_2[ind]) + egamct*gamc_2[ind])\
+                + la1[ind]*upm + (la[ind]*ec)*upv\
+                + la3[ind]*(-gam_2[ind] + etlq2gamt*(-t_lq2 + gam_2[ind]))\
+                + (c0l[ind]*ec)*(-et2_lq2*(t_lq2 + gam_2[ind]) + egamt*gam_2[ind])
+
+        return glq, gS, gB, gC
+
+    def _gkfu(self, X, index, Z, index2):
+        index = index.reshape(index.size,)
+        #TODO: reduce memory usage
+        #terms that move along t
+        d = np.unique(index)
+        B = self.B[d].values
+        C = self.C[d].values
+        S = self.W[d, :].values
+        #Index transformation
+        indd = np.arange(self.output_dim)
+        indd[d] = np.arange(d.size)
+        index = indd[index]
+        #Check where wd becomes complex
+        wbool = C*C >= 4.*B
+        #t column
+        t = X[:, 0].reshape(X.shape[0], 1)
+        C = C.reshape(C.size, 1)
+        B = B.reshape(B.size, 1)
+        C2 = C*C
+        #z row
+        z = Z[:, 0].reshape(1, Z.shape[0])
+        index2 = index2.reshape(index2.size,)
+        lq = self.lengthscale.values.reshape((1, self.rank))
+        lq2 = lq*lq
+
+        alpha = .5*C
+
+        wbool2 = wbool[index]
+        ind2t = np.where(wbool2)
+        ind3t = np.where(np.logical_not(wbool2))
+        #kfu = np.empty((t.size, z.size))
+        glq = np.empty((t.size, z.size))
+        gSdq = np.empty((t.size, z.size))
+        gB = np.empty((t.size, z.size))
+        gC = np.empty((t.size, z.size))
+
+        indD = np.arange(B.size)
+        #(1) when wd is real
+        if np.any(np.logical_not(wbool)):
+            #Indexes of index and t related to (2)
+            t1 = t[ind3t]
+            ind = index[ind3t]
+            #Index transformation
+            d = np.asarray(np.where(np.logical_not(wbool))[0])
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            w = .5*np.sqrt(4.*B[d] - C2[d])
+            alphad = alpha[d]
+            gam = alphad - 1j*w
+            gam_2 = .5*gam
+            S_w = S[d]/w
+            S_wpi = S_w*(.5*np.sqrt(np.pi))
+            #DxQ terms
+            c0 = S_wpi*lq #lq*Sdq*sqrt(pi)/(2w)
+            nu = gam*lq
+            nu2 = 1.+.5*(nu*nu)
+            nu *= .5
+
+            #1xM terms
+            z_lq = z/lq[0, index2]
+            z_lq2 = -z_lq*z_lq
+            #NxQ terms
+            t_lq = t1/lq
+            #DxM terms
+            gamt = -gam[ind]*t1
+            #NxM terms
+            zt_lq = z_lq - t_lq[:, index2]
+            zt_lq2 = -zt_lq*zt_lq
+            ezt_lq2 = -np.exp(zt_lq2)
+            ezgamt = np.exp(z_lq2 + gamt)
+
+            # Upsilon calculations
+            fullind = np.ix_(ind, index2)
+            upsi = - np.exp(z_lq2 + gamt + np.log(wofz(1j*(z_lq + nu[fullind]))))
+            tz = t1-z
+            z1 = zt_lq + nu[fullind]
+            indv1 = np.where(z1.real >= 0.)
+            indv2 = np.where(z1.real < 0.)
+            if indv1[0].shape > 0:
+                upsi[indv1] += np.exp(zt_lq2[indv1] + np.log(wofz(1j*z1[indv1])))
+            if indv2[0].shape > 0:
+                nua2 = nu[ind[indv2[0]], index2[indv2[1]]]**2
+                upsi[indv2] += np.exp(nua2 - gam[ind[indv2[0]], 0]*tz[indv2] + np.log(2.))\
+                               - np.exp(zt_lq2[indv2] + np.log(wofz(-1j*z1[indv2])))
+            upsi[t1[:, 0] == 0., :] = 0.
+
+            #Gradient wrt S
+            #DxQ term
+            Sa1 = lq*(.5*np.sqrt(np.pi))/w
+
+            gSdq[ind3t] = Sa1[np.ix_(ind, index2)]*upsi.imag
+
+            #Gradient wrt lq
+            la1 = S_wpi*nu2
+            la2 = S_w*lq
+            uplq = ezt_lq2*(gam_2[ind])
+            uplq += ezgamt*(-z_lq/lq[0, index2] + gam_2[ind])
+
+            glq[ind3t] = (la1[np.ix_(ind, index2)]*upsi).imag
+            glq[ind3t] += la2[np.ix_(ind, index2)]*uplq.imag
+
+            #Gradient wrt B
+            #Dx1 terms
+            dw_dB = .5/w
+            dgam_dB = -1j*dw_dB
+            #DxQ terms
+            Ba1 = -c0*dw_dB/w #DXQ
+            Ba2 = c0*dgam_dB #DxQ
+            Ba3 = lq2*gam_2 #DxQ
+            Ba4 = (dgam_dB*S_w)*(.5*lq2) #DxQ
+
+            gB[ind3t] = ((Ba1[np.ix_(ind, index2)] + Ba2[np.ix_(ind, index2)]*(Ba3[np.ix_(ind, index2)] - (t1-z)))*upsi).imag\
+                + (Ba4[np.ix_(ind, index2)]*(ezt_lq2 + ezgamt)).imag
+
+            #Gradient wrt C (it uses some calculations performed in B)
+            #Dx1 terms
+            dw_dC = -.5*alphad/w
+            dgam_dC = 0.5 - 1j*dw_dC
+            #DxQ terms
+            Ca1 = -c0*dw_dC/w #DXQ
+            Ca2 = c0*dgam_dC #DxQ
+            Ca4 = (dgam_dC*S_w)*(.5*lq2) #DxQ
+
+            gC[ind3t] = ((Ca1[np.ix_(ind, index2)] + Ca2[np.ix_(ind, index2)]*(Ba3[np.ix_(ind, index2)] - (t1-z)))*upsi).imag\
+                + (Ca4[np.ix_(ind, index2)]*(ezt_lq2 + ezgamt)).imag
+
+        #(2) when wd is complex
+        if np.any(wbool):
+            #Indexes of index and t related to (2)
+            t1 = t[ind2t]
+            ind = index[ind2t]
+            #Index transformation
+            d = np.asarray(np.where(wbool)[0])
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            w = .5*np.sqrt(C2[d] - 4.*B[d])
+            w2 = w*w
+            alphad = alpha[d]
+            gam = alphad - w
+            gamc = alphad + w
+            #DxQ terms
+            S_w= -S[d]/w #minus is given by j*j
+            S_wpi = S_w*(.25*np.sqrt(np.pi))
+
+            c0 = S_wpi*lq
+            gam_2 = .5*gam
+            gamc_2 = .5*gamc
+            nu = gam*lq
+            nuc = gamc*lq
+            nu2 = 1.+.5*(nu*nu)
+            nuc2 = 1.+.5*(nuc*nuc)
+            nu *= .5
+            nuc *= .5
+            #1xM terms
+            z_lq = z/lq[0, index2]
+            z_lq2 = -z_lq*z_lq
+            #Nx1
+            gamt = -gam[ind]*t1
+            gamct = -gamc[ind]*t1
+            #NxQ terms
+            t_lq = t1/lq[0, index2]
+            #NxM terms
+            zt_lq = z_lq - t_lq
+            zt_lq2 = -zt_lq*zt_lq
+            ezt_lq2 = -np.exp(zt_lq2)
+            ezgamt = np.exp(z_lq2 + gamt)
+            ezgamct = np.exp(z_lq2 + gamct)
+
+            # Upsilon calculations
+            fullind = np.ix_(ind, index2)
+            upsi1 = - np.exp(z_lq2 + gamct + np.log(wofz(1j*(z_lq + nuc[fullind])).real))
+            tz = t1-z
+            z1 = zt_lq + nuc[fullind]
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            if indv1[0].shape > 0:
+                upsi1[indv1] += np.exp(zt_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                nuac2 = nuc[ind[indv2[0]], index2[indv2[1]]]**2
+                upsi1[indv2] += np.exp(nuac2 - gamc[ind[indv2[0]], 0]*tz[indv2] + np.log(2.))\
+                               - np.exp(zt_lq2[indv2] + np.log(wofz(-1j*z1[indv2]).real))
+            upsi1[t1[:, 0] == 0., :] = 0.
+
+            upsi2 = - np.exp(z_lq2 + gamt + np.log(wofz(1j*(z_lq + nu[fullind])).real))
+            z1 = zt_lq + nu[fullind]
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            if indv1[0].shape > 0:
+                upsi2[indv1] += np.exp(zt_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                nua2 = nu[ind[indv2[0]], index2[indv2[1]]]**2
+                upsi2[indv2] += np.exp(nua2 - gam[ind[indv2[0]], 0]*tz[indv2] + np.log(2.))\
+                               - np.exp(zt_lq2[indv2] + np.log(wofz(-1j*z1[indv2]).real))
+            upsi2[t1[:, 0] == 0., :] = 0.
+
+            #Gradient wrt lq
+            la1 = S_wpi*nu2
+            la1c = S_wpi*nuc2
+            la2 = S_w*(.5*lq)
+            uplq = ezt_lq2*(gamc_2[ind]) + ezgamct*(-z_lq/lq[0, index2] + gamc_2[ind])\
+                - ezt_lq2*(gam_2[ind]) - ezgamt*(-z_lq/lq[0, index2] + gam_2[ind])
+
+            glq[ind2t] = la1c[np.ix_(ind, index2)]*upsi1 - la1[np.ix_(ind, index2)]*upsi2\
+                + la2[np.ix_(ind, index2)]*uplq
+
+
+            #Gradient wrt S
+            Sa1 = (lq*(-.25*np.sqrt(np.pi)))/w
+
+            gSdq[ind2t] = Sa1[np.ix_(ind, index2)]*(upsi1 - upsi2)
+
+            #Gradient wrt B
+            #Dx1 terms
+            dgam_dB = .5/w
+            dgamc_dB = -dgam_dB
+            #DxQ terms
+            Ba1 = .5*(c0/w2)
+            Ba2 = c0*dgam_dB
+            Ba3 = lq2*gam_2
+            Ba4 = (dgam_dB*S_w)*(.25*lq2)
+
+            Ba2c = c0*dgamc_dB
+            Ba3c = lq2*gamc_2
+            Ba4c = (dgamc_dB*S_w)*(.25*lq2)
+
+            gB[ind2t] = (Ba1[np.ix_(ind, index2)] + Ba2c[np.ix_(ind, index2)]*(Ba3c[np.ix_(ind, index2)] - (t1-z)))*upsi1\
+                + Ba4c[np.ix_(ind, index2)]*(ezt_lq2 + ezgamct)\
+                - (Ba1[np.ix_(ind, index2)] + Ba2[np.ix_(ind, index2)]*(Ba3[np.ix_(ind, index2)] - (t1-z)))*upsi2\
+                - Ba4[np.ix_(ind, index2)]*(ezt_lq2 + ezgamt)
+
+            #Gradient wrt C
+            #Dx1 terms
+            dgam_dC = 0.5 - .5*(alphad/w)
+            dgamc_dC = 0.5 + .5*(alphad/w)
+            #DxQ terms
+            Ca1 = -c0*(.5*alphad/w2)
+            Ca2 = c0*dgam_dC
+            Ca4 = (dgam_dC*S_w)*(.25*lq2)
+
+            Ca2c = c0*dgamc_dC
+            Ca4c = (dgamc_dC*S_w)*(.25*lq2)
+
+            gC[ind2t] = (Ca1[np.ix_(ind, index2)] + Ca2c[np.ix_(ind, index2)]*(Ba3c[np.ix_(ind, index2)] - (t1-z)))*upsi1\
+                + Ca4c[np.ix_(ind, index2)]*(ezt_lq2 + ezgamct)\
+                - (Ca1[np.ix_(ind, index2)] + Ca2[np.ix_(ind, index2)]*(Ba3[np.ix_(ind, index2)] - (t1-z)))*upsi2\
+                - Ca4[np.ix_(ind, index2)]*(ezt_lq2 + ezgamt)
+
+        return glq, gSdq, gB, gC
+
+    #TODO: reduce memory usage
+    def _gkfu_z(self, X, index, Z, index2): #Kfu(t,z)
+        index = index.reshape(index.size,)
+        #terms that move along t
+        d = np.unique(index)
+        B = self.B[d].values
+        C = self.C[d].values
+        S = self.W[d, :].values
+        #Index transformation
+        indd = np.arange(self.output_dim)
+        indd[d] = np.arange(d.size)
+        index = indd[index]
+        #Check where wd becomes complex
+        wbool = C*C >= 4.*B
+        wbool2 = wbool[index]
+        ind2t = np.where(wbool2)
+        ind3t = np.where(np.logical_not(wbool2))
+        #t column
+        t = X[:, 0].reshape(X.shape[0], 1)
+        C = C.reshape(C.size, 1)
+        B = B.reshape(B.size, 1)
+        C2 = C*C
+        alpha = .5*C
+        #z row
+        z = Z[:, 0].reshape(1, Z.shape[0])
+        index2 = index2.reshape(index2.size,)
+        lq = self.lengthscale.values.reshape((1, self.rank))
+
+        #kfu = np.empty((t.size, z.size))
+        gz = np.empty((t.size, z.size))
+        indD = np.arange(B.size)
+        #(1) when wd is real
+        if np.any(np.logical_not(wbool)):
+            #Indexes of index and t related to (2)
+            t1 = t[ind3t]
+            ind = index[ind3t]
+            #TODO: Find a better way of doing this
+            #Index transformation
+            d = np.asarray(np.where(np.logical_not(wbool))[0])
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            w = .5*np.sqrt(4.*B[d] - C2[d])
+            alphad = alpha[d]
+            gam = alphad - 1j*w
+            S_w = S[d]/w
+            S_wpi =S_w*(.5*np.sqrt(np.pi))
+            #DxQ terms
+            c0 = S_wpi*lq #lq*Sdq*sqrt(pi)/(2w)
+            nu = (.5*gam)*lq
+
+            #1xM terms
+            z_lq = z/lq[0, index2]
+            z_lq2 = -z_lq*z_lq
+            #NxQ terms
+            t_lq = t1/lq
+            #DxM terms
+            gamt = -gam[ind]*t1
+            #NxM terms
+            zt_lq = z_lq - t_lq[:, index2]
+            zt_lq2 = -zt_lq*zt_lq
+            #ezt_lq2 = -np.exp(zt_lq2)
+            ezgamt = np.exp(z_lq2 + gamt)
+
+            # Upsilon calculations
+            fullind = np.ix_(ind, index2)
+            upsi = - np.exp(z_lq2 + gamt + np.log(wofz(1j*(z_lq + nu[fullind]))))
+            tz = t1-z
+            z1 = zt_lq + nu[fullind]
+            indv1 = np.where(z1.real >= 0.)
+            indv2 = np.where(z1.real < 0.)
+            if indv1[0].shape > 0:
+                upsi[indv1] += np.exp(zt_lq2[indv1] + np.log(wofz(1j*z1[indv1])))
+            if indv2[0].shape > 0:
+                nua2 = nu[ind[indv2[0]], index2[indv2[1]]]**2
+                upsi[indv2] += np.exp(nua2 - gam[ind[indv2[0]], 0]*tz[indv2] + np.log(2.))\
+                               - np.exp(zt_lq2[indv2] + np.log(wofz(-1j*z1[indv2])))
+            upsi[t1[:, 0] == 0., :] = 0.
+
+            #Gradient wrt z
+            za1 = c0*gam
+            #za2 = S_w
+            gz[ind3t] = (za1[np.ix_(ind, index2)]*upsi).imag + S_w[np.ix_(ind, index2)]*ezgamt.imag
+
+        #(2) when wd is complex
+        if np.any(wbool):
+            #Indexes of index and t related to (2)
+            t1 = t[ind2t]
+            ind = index[ind2t]
+            #Index transformation
+            d = np.asarray(np.where(wbool)[0])
+            indd = indD.copy()
+            indd[d] = np.arange(d.size)
+            ind = indd[ind]
+            #Dx1 terms
+            w = .5*np.sqrt(C2[d] - 4.*B[d])
+            alphad = alpha[d]
+            gam = alphad - w
+            gamc = alphad + w
+            #DxQ terms
+            S_w = -S[d]/w #minus is given by j*j
+            S_wpi = S_w*(.25*np.sqrt(np.pi))
+            c0 = S_wpi*lq
+            nu = .5*gam*lq
+            nuc = .5*gamc*lq
+
+            #1xM terms
+            z_lq = z/lq[0, index2]
+            z_lq2 = -z_lq*z_lq
+            #Nx1
+            gamt = -gam[ind]*t1
+            gamct = -gamc[ind]*t1
+            #NxQ terms
+            t_lq = t1/lq
+            #NxM terms
+            zt_lq = z_lq - t_lq[:, index2]
+            ezgamt = np.exp(z_lq2 + gamt)
+            ezgamct = np.exp(z_lq2 + gamct)
+
+            # Upsilon calculations
+            zt_lq2 = -zt_lq*zt_lq
+            fullind = np.ix_(ind, index2)
+            upsi1 = - np.exp(z_lq2 + gamct + np.log(wofz(1j*(z_lq + nuc[fullind])).real))
+            tz = t1-z
+            z1 = zt_lq + nuc[fullind]
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            if indv1[0].shape > 0:
+                upsi1[indv1] += np.exp(zt_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                nuac2 = nuc[ind[indv2[0]], index2[indv2[1]]]**2
+                upsi1[indv2] += np.exp(nuac2 - gamc[ind[indv2[0]], 0]*tz[indv2] + np.log(2.))\
+                               - np.exp(zt_lq2[indv2] + np.log(wofz(-1j*z1[indv2]).real))
+            upsi1[t1[:, 0] == 0., :] = 0.
+
+            upsi2 = - np.exp(z_lq2 + gamt + np.log(wofz(1j*(z_lq + nu[fullind])).real))
+            z1 = zt_lq + nu[fullind]
+            indv1 = np.where(z1 >= 0.)
+            indv2 = np.where(z1 < 0.)
+            if indv1[0].shape > 0:
+                upsi2[indv1] += np.exp(zt_lq2[indv1] + np.log(wofz(1j*z1[indv1]).real))
+            if indv2[0].shape > 0:
+                nua2 = nu[ind[indv2[0]], index2[indv2[1]]]**2
+                upsi2[indv2] += np.exp(nua2 - gam[ind[indv2[0]], 0]*tz[indv2] + np.log(2.))\
+                               - np.exp(zt_lq2[indv2] + np.log(wofz(-1j*z1[indv2]).real))
+            upsi2[t1[:, 0] == 0., :] = 0.
+
+            #Gradient wrt z
+            za1 = c0*gam
+            za1c = c0*gamc
+            za2 = .5*S_w
+            gz[ind2t] = za1c[np.ix_(ind, index2)]*upsi1 - za1[np.ix_(ind, index2)]*upsi2\
+                + za2[np.ix_(ind, index2)]*(ezgamct - ezgamt)
+        return gz
diff --git a/GPy/kern/_src/mlp.py b/GPy/kern/_src/mlp.py
index badbd60d..16e84363 100644
--- a/GPy/kern/_src/mlp.py
+++ b/GPy/kern/_src/mlp.py
@@ -79,8 +79,14 @@ class MLP(Kern):
                              + 2*self.bias_variance + 2.))*base_cov_grad).sum()
 
     def update_gradients_diag(self, X):
-        raise NotImplementedError, "TODO"
-
+        self._K_diag_computations(X)
+        self.variance.gradient = np.sum(self._K_diag_dvar*dL_dKdiag)
+        
+        base = four_over_tau*self.variance/np.sqrt(1-self._K_diag_asin_arg*self._K_diag_asin_arg)
+        base_cov_grad = base*dL_dKdiag/np.square(self._K_diag_denom)
+        
+        self.weight_variance.gradient = (base_cov_grad*np.square(X).sum(axis=1)).sum()
+        self.bias_variance.gradient = base_cov_grad.sum()
 
     def gradients_X(self, dL_dK, X, X2):
         """Derivative of the covariance matrix with respect to X"""
diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index dd9a5fe4..bff6d841 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -42,25 +42,41 @@ class Prod(CombinationKernel):
         return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
 
     def update_gradients_full(self, dL_dK, X, X2=None):
-        k = self.K(X,X2)*dL_dK
-        for p in self.parts:
-            p.update_gradients_full(k/p.K(X,X2),X,X2)
+        if len(self.parts)==2:
+            self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
+            self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
+        else:
+            k = self.K(X,X2)*dL_dK
+            for p in self.parts:
+                p.update_gradients_full(k/p.K(X,X2),X,X2)
 
     def update_gradients_diag(self, dL_dKdiag, X):
-        k = self.Kdiag(X)*dL_dKdiag
-        for p in self.parts:
-            p.update_gradients_diag(k/p.Kdiag(X),X)
+        if len(self.parts)==2:
+            self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
+            self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
+        else:
+            k = self.Kdiag(X)*dL_dKdiag
+            for p in self.parts:
+                p.update_gradients_diag(k/p.Kdiag(X),X)
 
     def gradients_X(self, dL_dK, X, X2=None):
         target = np.zeros(X.shape)
-        k = self.K(X,X2)*dL_dK
-        for p in self.parts:
-            target += p.gradients_X(k/p.K(X,X2),X,X2)
+        if len(self.parts)==2:
+            target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
+            target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
+        else:
+            k = self.K(X,X2)*dL_dK
+            for p in self.parts:
+                target += p.gradients_X(k/p.K(X,X2),X,X2)
         return target
 
     def gradients_X_diag(self, dL_dKdiag, X):
         target = np.zeros(X.shape)
-        k = self.Kdiag(X)*dL_dKdiag
-        for p in self.parts:
-            target += p.gradients_X_diag(k/p.Kdiag(X),X)
+        if len(self.parts)==2:
+            target += self.parts[0].gradients_X_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
+            target += self.parts[1].gradients_X_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
+        else:
+            k = self.Kdiag(X)*dL_dKdiag
+            for p in self.parts:
+                target += p.gradients_X_diag(k/p.Kdiag(X),X)
         return target
diff --git a/GPy/kern/_src/psi_comp/linear_psi_comp.py b/GPy/kern/_src/psi_comp/linear_psi_comp.py
index 93297e7e..50090428 100644
--- a/GPy/kern/_src/psi_comp/linear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py
@@ -6,6 +6,7 @@ The package for the Psi statistics computation of the linear kernel for Bayesian
 """
 
 import numpy as np
+from ....util.linalg import tdot
 
 def psicomputations(variance, Z, variational_posterior):
     """
@@ -19,9 +20,9 @@ def psicomputations(variance, Z, variational_posterior):
     mu = variational_posterior.mean
     S = variational_posterior.variance
 
-    psi0 = np.einsum('q,nq->n',variance,np.square(mu)+S)
-    psi1 = np.einsum('q,mq,nq->nm',variance,Z,mu)
-    psi2 = np.einsum('q,mq,oq,nq->mo',np.square(variance),Z,Z,S) + np.einsum('nm,no->mo',psi1,psi1)
+    psi0 = (variance*(np.square(mu)+S)).sum(axis=1)
+    psi1 = np.dot(mu,(variance*Z).T)
+    psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T)
 
     return psi0, psi1, psi2
 
@@ -33,10 +34,12 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati
 
     # Compute for psi0 and psi1
     mu2S = np.square(mu)+S
-    dL_dvar += np.einsum('n,nq->q',dL_dpsi0,mu2S) + np.einsum('nm,mq,nq->q',dL_dpsi1,Z,mu)
-    dL_dmu += np.einsum('n,q,nq->nq',dL_dpsi0,2.*variance,mu) + np.einsum('nm,q,mq->nq',dL_dpsi1,variance,Z)
-    dL_dS += np.einsum('n,q->nq',dL_dpsi0,variance)
-    dL_dZ +=  np.einsum('nm,q,nq->mq',dL_dpsi1, variance,mu)
+    dL_dpsi0_var = dL_dpsi0[:,None]*variance[None,:]
+    dL_dpsi1_mu = np.dot(dL_dpsi1.T,mu)
+    dL_dvar += (dL_dpsi0[:,None]*mu2S).sum(axis=0)+ (dL_dpsi1_mu*Z).sum(axis=0)
+    dL_dmu += 2.*dL_dpsi0_var*mu+np.dot(dL_dpsi1,Z)*variance
+    dL_dS += dL_dpsi0_var
+    dL_dZ += dL_dpsi1_mu*variance
     
     return dL_dvar, dL_dZ, dL_dmu, dL_dS
 
@@ -55,17 +58,20 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S):
     # _psi2_dS             NxQ
     
     variance2 = np.square(variance)
-    common_sum = np.einsum('q,mq,nq->nm',variance,Z,mu) # NxM
-    Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z)
-    common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
+    common_sum = np.dot(mu,(variance*Z).T)
+    Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
+    dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
+    common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
+    Z2_expect = np.inner(common_sum,dL_dpsi2T)
+    Z1_expect = np.dot(dL_dpsi2T,Z)
 
-    dL_dvar = np.einsum('q,nq,q->q',Z_expect,2.*S,variance)+ np.einsum('nq,nq->q',common_expect,mu)
+    dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
             
-    dL_dmu = np.einsum('nq,q->nq',common_expect,variance)
+    dL_dmu = common_expect*variance
     
     dL_dS = np.empty(S.shape)
-    dL_dS[:] = np.einsum('q,q->q',Z_expect,variance2)
+    dL_dS[:] = Z_expect*variance2
     
-    dL_dZ = 2.*(np.einsum('om,q,mq,nq->oq',dL_dpsi2,variance2,Z,S)+np.einsum('om,q,nq,nm->oq',dL_dpsi2,variance,mu,common_sum))
+    dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
 
     return dL_dvar, dL_dmu, dL_dS, dL_dZ
diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py
index 443871af..06671b23 100644
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@@ -159,7 +159,7 @@ class Stationary(Kern):
             #self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
             tmp = dL_dr*self._inv_dist(X, X2)
             if X2 is None: X2 = X
-            
+
 
             if config.getboolean('weave', 'working'):
                 try:
@@ -261,7 +261,7 @@ class Stationary(Kern):
             ret(n,d) = retnd;
           }
         }
- 
+
         """
         if hasattr(X, 'values'):X = X.values #remove the GPy wrapping to make passing into weave safe
         if hasattr(X2, 'values'):X2 = X2.values
@@ -278,12 +278,12 @@ class Stationary(Kern):
                          'extra_link_args'   : ['-lgomp']}
         weave.inline(code, ['ret', 'N', 'D', 'M', 'tmp', 'X', 'X2'], type_converters=weave.converters.blitz, support_code=support_code, **weave_options)
         return ret/self.lengthscale**2
-    
+
     def gradients_X_diag(self, dL_dKdiag, X):
         return np.zeros(X.shape)
 
     def input_sensitivity(self, summarize=True):
-        return np.ones(self.input_dim)/self.lengthscale**2
+        return self.variance*np.ones(self.input_dim)/self.lengthscale**2
 
 class Exponential(Stationary):
     def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Exponential'):
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 35015b2d..26de274b 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, 2013 The GPy authors (see AUTHORS.txt)
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
@@ -77,6 +77,32 @@ class Bernoulli(Likelihood):
 
         return Z_hat, mu_hat, sigma2_hat
 
+    def variational_expectations(self, Y, m, v, gh_points=None):
+        if isinstance(self.gp_link, link_functions.Probit):
+
+            if gh_points is None:
+                gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
+            else:
+                gh_x, gh_w = gh_points
+
+            from scipy import stats
+
+            shape = m.shape
+            m,v,Y = m.flatten(), v.flatten(), Y.flatten()
+            Ysign = np.where(Y==1,1,-1)
+            X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + (m*Ysign)[:,None]
+            p = stats.norm.cdf(X)
+            p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
+            N = stats.norm.pdf(X)
+            F = np.log(p).dot(gh_w)
+            NoverP = N/p
+            dF_dm = (NoverP*Ysign[:,None]).dot(gh_w)
+            dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(gh_w)
+            return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), None
+        else:
+            raise NotImplementedError
+
+
     def predictive_mean(self, mu, variance, Y_metadata=None):
 
         if isinstance(self.gp_link, link_functions.Probit):
@@ -133,7 +159,7 @@ class Bernoulli(Likelihood):
         """
         #objective = y*np.log(inv_link_f) + (1.-y)*np.log(inv_link_f)
         p = np.where(y==1, inv_link_f, 1.-inv_link_f)
-        return np.log(np.clip(p, 1e-6 ,np.inf))
+        return np.log(np.clip(p, 1e-9 ,np.inf))
 
     def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
         """
@@ -152,7 +178,7 @@ class Bernoulli(Likelihood):
         """
         #grad = (y/inv_link_f) - (1.-y)/(1-inv_link_f)
         #grad = np.where(y, 1./inv_link_f, -1./(1-inv_link_f))
-        ff = np.clip(inv_link_f, 1e-6, 1-1e-6)
+        ff = np.clip(inv_link_f, 1e-9, 1-1e-9)
         denom = np.where(y, ff, -(1-ff))
         return 1./denom
 
@@ -180,7 +206,7 @@ class Bernoulli(Likelihood):
         #d2logpdf_dlink2 = -y/(inv_link_f**2) - (1-y)/((1-inv_link_f)**2)
         #d2logpdf_dlink2 = np.where(y, -1./np.square(inv_link_f), -1./np.square(1.-inv_link_f))
         arg = np.where(y, inv_link_f, 1.-inv_link_f)
-        ret =  -1./np.square(np.clip(arg, 1e-3, np.inf))
+        ret =  -1./np.square(np.clip(arg, 1e-9, 1e9))
         if np.any(np.isinf(ret)):
             stop
         return ret
diff --git a/GPy/likelihoods/exponential.py b/GPy/likelihoods/exponential.py
index 489a4c9e..8110c7d4 100644
--- a/GPy/likelihoods/exponential.py
+++ b/GPy/likelihoods/exponential.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, 2013 GPy Authors
+# Copyright (c) 2012-2014 GPy Authors
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 2546b07a..a6e5b7e0 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 #TODO
 """
@@ -316,3 +316,11 @@ class Gaussian(Likelihood):
         v = var_star + self.variance
         return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v
 
+    def variational_expectations(self, Y, m, v, gh_points=None):
+        lik_var = float(self.variance)
+        F = -0.5*np.log(2*np.pi) -0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var
+        dF_dmu = (Y - m)/lik_var
+        dF_dv = np.ones_like(v)*(-0.5/lik_var)
+        dF_dlik_var = np.sum(-0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2))
+        dF_dtheta = [dF_dlik_var]
+        return F, dF_dmu, dF_dv, dF_dtheta
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index b60fcb9e..790c6ba4 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
@@ -133,7 +133,7 @@ class Likelihood(Parameterized):
 
     def variational_expectations(self, Y, m, v, gh_points=None):
         """
-        Use Gauss-Hermite Quadrature to compute 
+        Use Gauss-Hermite Quadrature to compute
 
            E_p(f) [ log p(y|f) ]
            d/dm E_p(f) [ log p(y|f) ]
@@ -143,9 +143,10 @@ class Likelihood(Parameterized):
 
         if no gh_points are passed, we construct them using defualt options
         """
+        #May be broken
 
         if gh_points is None:
-            gh_x, gh_w = np.polynomial.hermite.hermgauss(12)
+            gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
         else:
             gh_x, gh_w = gh_points
 
@@ -156,15 +157,15 @@ class Likelihood(Parameterized):
         X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]
 
         #evaluate the likelhood for the grid. First ax indexes the data (and mu, var) and the second indexes the grid.
-        # broadcast needs to be handled carefully. 
+        # broadcast needs to be handled carefully.
         logp = self.logpdf(X,Y[:,None])
         dlogp_dx = self.dlogpdf_df(X, Y[:,None])
         d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None])
 
         #clipping for numerical stability
-        logp = np.clip(logp,-1e6,1e6)
-        dlogp_dx = np.clip(dlogp_dx,-1e6,1e6)
-        d2logp_dx2 = np.clip(d2logp_dx2,-1e6,1e6)
+        #logp = np.clip(logp,-1e9,1e9)
+        #dlogp_dx = np.clip(dlogp_dx,-1e9,1e9)
+        #d2logp_dx2 = np.clip(d2logp_dx2,-1e9,1e9)
 
         #average over the gird to get derivatives of the Gaussian's parameters
         F = np.dot(logp, gh_w)
@@ -176,10 +177,8 @@ class Likelihood(Parameterized):
         if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
             stop
 
-        return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape)
-
-
-
+        dF_dtheta = None # Not yet implemented
+        return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), dF_dtheta
 
     def predictive_mean(self, mu, variance, Y_metadata=None):
         """
diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py
index a7d36057..a4ddc760 100644
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, 2013 The GPy authors
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
diff --git a/GPy/likelihoods/mixed_noise.py b/GPy/likelihoods/mixed_noise.py
index 9692bb07..8c56f45b 100644
--- a/GPy/likelihoods/mixed_noise.py
+++ b/GPy/likelihoods/mixed_noise.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 import numpy as np
 from scipy import stats, special
 import link_functions
diff --git a/GPy/likelihoods/poisson.py b/GPy/likelihoods/poisson.py
index 088fc478..ea9b2d10 100644
--- a/GPy/likelihoods/poisson.py
+++ b/GPy/likelihoods/poisson.py
@@ -1,5 +1,5 @@
 from __future__ import division
-# Copyright (c) 2012, 2013 Ricardo Andrade
+# Copyright (c) 2012-2014 Ricardo Andrade, Alan Saul
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py
index 3aeb43e0..855f6b40 100644
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
+# Copyright (c) 2012-2014 Ricardo Andrade, Alan Saul
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index fca97e96..7cbd69eb 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012 - 2014 the GPy Austhors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py
index 80abba59..71f69eb2 100644
--- a/GPy/models/bayesian_gplvm_minibatch.py
+++ b/GPy/models/bayesian_gplvm_minibatch.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
@@ -8,6 +8,7 @@ from ..core.parameterization.variational import NormalPosterior, NormalPrior
 from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
 import logging
 from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch
+from GPy.core.parameterization.param import Param
 
 class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
     """
@@ -35,15 +36,20 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
 
         self.init = init
 
-        if X_variance is None:
-            self.logger.info("initializing latent space variance ~ uniform(0,.1)")
-            X_variance = np.random.uniform(0,.1,X.shape)
-
         if Z is None:
             self.logger.info("initializing inducing inputs")
             Z = np.random.permutation(X.copy())[:num_inducing]
         assert Z.shape[1] == X.shape[1]
 
+        if X_variance == False:
+            self.logger.info('no variance on X, activating sparse GPLVM')
+            X = Param("latent space", X)
+        elif X_variance is None:
+            self.logger.info("initializing latent space variance ~ uniform(0,.1)")
+            X_variance = np.random.uniform(0,.1,X.shape)
+            self.variational_prior = NormalPrior()
+            X = NormalPosterior(X, X_variance)
+
         if kernel is None:
             self.logger.info("initializing kernel RBF")
             kernel = kern.RBF(input_dim, lengthscale=1./fracs, ARD=True) #+ kern.Bias(input_dim) + kern.White(input_dim)
@@ -51,9 +57,6 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         if likelihood is None:
             likelihood = Gaussian()
 
-        self.variational_prior = NormalPrior()
-        X = NormalPosterior(X, X_variance)
-
         self.kl_factr = 1.
 
         if inference_method is None:
@@ -80,39 +83,45 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         """Get the gradients of the posterior distribution of X in its specific form."""
         return X.mean.gradient, X.variance.gradient
 
-    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
-        posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices)
+    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kw):
+        posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices, **kw)
 
-        current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
-                                            variational_posterior=X,
-                                            Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
-                                            dL_dpsi1=grad_dict['dL_dpsi1'],
-                                            dL_dpsi2=grad_dict['dL_dpsi2'])
+        if self.has_uncertain_inputs():
+            current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
+                                                variational_posterior=X,
+                                                Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
+                                                dL_dpsi1=grad_dict['dL_dpsi1'],
+                                                dL_dpsi2=grad_dict['dL_dpsi2'])
+        else:
+            current_values['Xgrad'] = self.kern.gradients_X(grad_dict['dL_dKnm'], X, Z)
+            current_values['Xgrad'] += self.kern.gradients_X_diag(grad_dict['dL_dKdiag'], X)
+            if subset_indices is not None:
+                value_indices['Xgrad'] = subset_indices['samples']
 
         kl_fctr = self.kl_factr
-        if self.missing_data:
-            d = self.output_dim
-            log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d
-        else:
-            log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)
+        if self.has_uncertain_inputs():
+            if self.missing_data:
+                d = self.output_dim
+                log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d
+            else:
+                log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)
 
+            # Subsetting Variational Posterior objects, makes the gradients
+            # empty. We need them to be 0 though:
+            X.mean.gradient[:] = 0
+            X.variance.gradient[:] = 0
 
-        # Subsetting Variational Posterior objects, makes the gradients
-        # empty. We need them to be 0 though:
-        X.mean.gradient[:] = 0
-        X.variance.gradient[:] = 0
+            self.variational_prior.update_gradients_KL(X)
+            if self.missing_data:
+                current_values['meangrad'] += kl_fctr*X.mean.gradient/d
+                current_values['vargrad'] += kl_fctr*X.variance.gradient/d
+            else:
+                current_values['meangrad'] += kl_fctr*X.mean.gradient
+                current_values['vargrad'] += kl_fctr*X.variance.gradient
 
-        self.variational_prior.update_gradients_KL(X)
-        if self.missing_data:
-            current_values['meangrad'] += kl_fctr*X.mean.gradient/d
-            current_values['vargrad'] += kl_fctr*X.variance.gradient/d
-        else:
-            current_values['meangrad'] += kl_fctr*X.mean.gradient
-            current_values['vargrad'] += kl_fctr*X.variance.gradient
-
-        if subset_indices is not None:
-            value_indices['meangrad'] = subset_indices['samples']
-            value_indices['vargrad'] = subset_indices['samples']
+            if subset_indices is not None:
+                value_indices['meangrad'] = subset_indices['samples']
+                value_indices['vargrad'] = subset_indices['samples']
         return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices
 
     def _outer_values_update(self, full_values):
@@ -121,42 +130,24 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         E.g. set the gradients of parameters, etc.
         """
         super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values)
-        self.X.mean.gradient = full_values['meangrad']
-        self.X.variance.gradient = full_values['vargrad']
+        if self.has_uncertain_inputs():
+            self.X.mean.gradient = full_values['meangrad']
+            self.X.variance.gradient = full_values['vargrad']
+        else:
+            self.X.gradient = full_values['Xgrad']
 
     def _outer_init_full_values(self):
-        return dict(meangrad=np.zeros(self.X.mean.shape),
-                    vargrad=np.zeros(self.X.variance.shape))
+        if self.has_uncertain_inputs():
+            return dict(meangrad=np.zeros(self.X.mean.shape),
+                        vargrad=np.zeros(self.X.variance.shape))
+        else:
+            return dict(Xgrad=np.zeros(self.X.shape))
 
     def parameters_changed(self):
         super(BayesianGPLVMMiniBatch,self).parameters_changed()
         if isinstance(self.inference_method, VarDTC_minibatch):
             return
 
-        #super(BayesianGPLVM, self).parameters_changed()
-        #self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
-
-        #self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2'])
-
-        # This is testing code -------------------------
-#         i = np.random.randint(self.X.shape[0])
-#         X_ = self.X.mean
-#         which = np.sqrt(((X_ - X_[i:i+1])**2).sum(1)).argsort()>(max(0, self.X.shape[0]-51))
-#         _, _, grad_dict = self.inference_method.inference(self.kern, self.X[which], self.Z, self.likelihood, self.Y[which], self.Y_metadata)
-#         grad = self.kern.gradients_qX_expectations(variational_posterior=self.X[which], Z=self.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
-#
-#         self.X.mean.gradient[:] = 0
-#         self.X.variance.gradient[:] = 0
-#         self.X.mean.gradient[which] = grad[0]
-#         self.X.variance.gradient[which] = grad[1]
-
-        # update for the KL divergence
-#         self.variational_prior.update_gradients_KL(self.X, which)
-        # -----------------------------------------------
-
-        # update for the KL divergence
-        #self.variational_prior.update_gradients_KL(self.X)
-
     def plot_latent(self, labels=None, which_indices=None,
                 resolution=50, ax=None, marker='o', s=40,
                 fignum=None, plot_inducing=True, legend=True,
diff --git a/GPy/models/bcgplvm.py b/GPy/models/bcgplvm.py
index c54ffdf6..899bb2f8 100644
--- a/GPy/models/bcgplvm.py
+++ b/GPy/models/bcgplvm.py
@@ -1,4 +1,4 @@
-# ## Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
diff --git a/GPy/models/gp_classification.py b/GPy/models/gp_classification.py
index 188d5e84..bbf4f316 100644
--- a/GPy/models/gp_classification.py
+++ b/GPy/models/gp_classification.py
@@ -1,4 +1,3 @@
-# Copyright (c) 2013, Ricardo Andrade
 # Copyright (c) 2013, the GPy Authors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py
index 9cc361ee..6318829d 100644
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@@ -1,4 +1,4 @@
-# ## Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index 645cdf88..f3e643c9 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -111,9 +111,6 @@ class MRD(BayesianGPLVMMiniBatch):
             assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!"
             kernels = kernel
 
-        if X_variance is None:
-            X_variance = np.random.uniform(0.1, 0.2, X.shape)
-
         self.variational_prior = NormalPrior()
         #self.X = NormalPosterior(X, X_variance)
 
@@ -174,10 +171,13 @@ class MRD(BayesianGPLVMMiniBatch):
             self.Z.gradient[:] += b.full_values['Zgrad']
             grad_dict = b.full_values
 
-            self.X.mean.gradient += grad_dict['meangrad']
-            self.X.variance.gradient += grad_dict['vargrad']
+            if self.has_uncertain_inputs():
+                self.X.mean.gradient += grad_dict['meangrad']
+                self.X.variance.gradient += grad_dict['vargrad']
+            else:
+                self.X.gradient += grad_dict['Xgrad']
 
-        if isinstance(self.X, VariationalPosterior):
+        if self.has_uncertain_inputs():
             # update for the KL divergence
             self.variational_prior.update_gradients_KL(self.X)
             self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py
index ec2e28f5..e827bb70 100644
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from ..core.parameterization.param import Param
+from ..core.sparse_gp import SparseGP
 from ..core.gp import GP
 from ..inference.latent_function_inference import var_dtc
 from .. import likelihoods
@@ -16,14 +17,9 @@ from GPy.inference.optimization.stochastics import SparseGPStochastics,\
     #SparseGPMissing
 logger = logging.getLogger("sparse gp")
 
-class SparseGPMiniBatch(GP):
+class SparseGPMiniBatch(SparseGP):
     """
-    A general purpose Sparse GP model
-'''
-Created on 3 Nov 2014
-
-@author: maxz
-'''
+    A general purpose Sparse GP model, allowing missing data and stochastics across dimensions.
 
     This model allows (approximate) inference using variational DTC or FITC
     (Gaussian likelihoods) as well as non-conjugate sparse methods based on
@@ -97,7 +93,7 @@ Created on 3 Nov 2014
     def has_uncertain_inputs(self):
         return isinstance(self.X, VariationalPosterior)
 
-    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
+    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kwargs):
         """
         This is the standard part, which usually belongs in parameters_changed.
 
@@ -117,7 +113,7 @@ Created on 3 Nov 2014
         algorithm.
         """
         try:
-            posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None)
+            posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None, **kwargs)
         except:
             posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata)
         current_values = {}
@@ -315,34 +311,3 @@ Created on 3 Nov 2014
         else:
             self.posterior, self._log_marginal_likelihood, self.grad_dict, self.full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
             self._outer_values_update(self.full_values)
-
-    def _raw_predict(self, Xnew, full_cov=False, kern=None):
-        """
-        Make a prediction for the latent function values
-        """
-
-        if kern is None: kern = self.kern
-
-        if not isinstance(Xnew, VariationalPosterior):
-            Kx = kern.K(self.Z, Xnew)
-            mu = np.dot(Kx.T, self.posterior.woodbury_vector)
-            if full_cov:
-                Kxx = kern.K(Xnew)
-                if self.posterior.woodbury_inv.ndim == 2:
-                    var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
-                elif self.posterior.woodbury_inv.ndim == 3:
-                    var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
-                var = var
-            else:
-                Kxx = kern.Kdiag(Xnew)
-                var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
-        else:
-            Kx = kern.psi1(self.Z, Xnew)
-            mu = np.dot(Kx, self.posterior.woodbury_vector)
-            if full_cov:
-                raise NotImplementedError, "TODO"
-            else:
-                Kxx = kern.psi0(self.Z, Xnew)
-                psi2 = kern.psi2(self.Z, Xnew)
-                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
-        return mu, var
diff --git a/GPy/notes.txt b/GPy/notes.txt
deleted file mode 100644
index 768701f2..00000000
--- a/GPy/notes.txt
+++ /dev/null
@@ -1,80 +0,0 @@
-Prod.py kernel could also take a list of kernels rather than two arguments for kernels.
-transformations.py should have limits on what is fed into exp() particularly for the negative log logistic (done -neil). 
-
-Load in a model with mlp kernel, plot it, change a parameter, plot it again. It doesn't update the plot.
-
-Tests for kernels which work directly on the kernel implementation (not through GP).
-
-Should stationary covariances have their own kernpart type, I think so, also inner product kernels. That way the caching so carefully constructed for RBF or linear could be shared.
-
-Where do we declare default kernel parameters. In constructors.py or in the definition file for the kernel?
-
-When printing to stdout, can we check that our approach is also working nicely for the ipython notebook? I like the way our optimization ticks over, but at the moment this doesn't seem to work in the ipython notebook, it would be nice if it did. My problems may be due to using ipython 0.12, I've had a poke around at fixing this and I can't do it for 0.12.
-
-When we print a model should we also include information such as number of inputs and number of outputs?
-
-Let's not use N for giving the number of data in the model. When it pops up as a help tip it's not as clear as num_samples or num_data. Prefer the second, but oddly I've been using first.
-
-Loving the fact that the * has been overloaded on the kernels (oddly never thought to check this before). Although naming can be a bit confusing. Can we think how to deal with the names in a clearer way when we use a kernel like this one:
-kern = GPy.kern.rbf(30)*(GPy.kern.mlp(30)+GPy.kern.poly(30, degree=5)) + GPy.kern.bias(30). There seems to be some tieing of parameters going on ... should there be? (you can try it as the kernel for the robot wireless model).
-
-Can we comment up some of the list incomprehensions in hierarchical.py??
-
-Need to tidy up classification.py, 
-many examples include help that doesn't apply 
-(it is suggested that you can try different approximation types)
-
-Shall we overload the ** operator to have tensor products? (I've done this now we can see if we like it)
-
-People aren't filling the doc strings in as they go *everyone* needs to get in the habit of this (and modifying them as they edit, or correcting them when there is a problem).
-
-Need some nice way of explaining how to compile documentation and run the unit tests, could this be in a readme or FAQ somewhere? Maybe it's there already somewhere and I've missed it.
-
-Shouldn't EP be in the inference package (not likelihoods)?
-
-When using bfgs in ipython notebook, text appears in the original console, not in the notebook.
-
-In sparse GPs wouldn't it be clearer to call Z inducing?
-
-In coregionalisation matrix, setting the W to all ones will (surely?) ensure that symmetry isn't broken. Also, but allowing it to scale like that, the output variance increases as rank is increased (and if user sets rank to more than output dim they could get very different results).
-
-We are inconsistent about our use of ise and ize e.g. optimize and normalize_X, but coregionalise, we should choose one and stick to it. Suggest -ize. Neil- I'm imposing the US spellings to keep things consistent, so -ize it is.
-
-Exceptions: we need to provide a list of exceptions we throw and specify what is thrown where. 
-
-Why is it get_params() but it's getstate()? Should be get_state(). Why is it get_gradient instead of get_gradients? Need to be consistent!! Doesn't matter which way we choose as long as it's consistent.
-
-In likelihood Nparams should be num_params
-
-In likelihood N should be num_data
-
-The Gaussian target in likelihood should be F What is V doing here?
-
-Need to check for nan values in likelihoods. These should be treated as missing values. If the likelihood can't handle the missing value an error should be throw.
-
-
-Sometimes you want to print kernpart objects, for diagnosis etc. This isn't possible currently. 
-
-Why do likelihoods still have YYT everywhere, didn't we agree to set observed data to Y and latent function to F?
-
-For some reason a stub of _get_param_names(self) wasn't available in the Parameterized base class. Have put it in (is this right?)
-
-Is there a quick FAQ or something on how to build the documentation? I did it once, but can't remember! Have started a FAQ.txt file where we can add this type of information. 
-
-Similar for the nosetests ... even ran them last week but can't remember the command!
-
-Now added Gaussian priors to GPLVM latent variables by default. When running the GPy.examples.dimensionality_reduction.stick() example the print out from print model has the same value for the prior+likelihood as for the prior.
-
-For the back constrained GP-LVM need priors to be on the Xs not on the model parameters (because they aren't parameters, they are constraints). Need to work out how to do this, perhaps by creating the full GP-LVM model then constraining around it, rather than overriding inside the GP-LVM model.
-
-
-This code fails:
-
-kern = GPy.kern.rbf(2)
-GPy.kern.Kern_check_dK_dX(kern, X=np.random.randn(10, 2), X2=None).checkgrad(verbose=True)
-
-because X2 is now equal to X, so there is a factor of 2 missing. Does this every come up? Yes, in the GP-LVM, (gplvm.py, line 64) where it is called with a corrective factor of 2! And on line 241 of sparse_gp where it is also called with a corrective factor of 2! In original matlab GPLVM, didn't allow gradients with respect to X alone, and multiplied by 2 in base code, but then add diagonal across those elements. This is missing in the new code.
-
-
-In white.py, line 41, Need to check here if X and X2 refer to the same reference too ... becaue up the pipeline somewhere someone may have set X2=X when X2 arrived originally equal to None.
-
diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py
index 25a1166f..1398b40c 100644
--- a/GPy/plotting/matplot_dep/dim_reduction_plots.py
+++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
 from latent_space_visualizations.controllers.imshow_controller import ImshowController,ImAnnotateController
diff --git a/GPy/plotting/matplot_dep/img_plots.py b/GPy/plotting/matplot_dep/img_plots.py
index 21dbd64f..453a904d 100644
--- a/GPy/plotting/matplot_dep/img_plots.py
+++ b/GPy/plotting/matplot_dep/img_plots.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 The module contains the tools for ploting 2D image visualizations
 """
diff --git a/GPy/plotting/matplot_dep/kernel_plots.py b/GPy/plotting/matplot_dep/kernel_plots.py
index dd0f1cf5..347e3d08 100644
--- a/GPy/plotting/matplot_dep/kernel_plots.py
+++ b/GPy/plotting/matplot_dep/kernel_plots.py
@@ -25,7 +25,7 @@ def add_bar_labels(fig, ax, bars, bottom=0):
             c = 'w'
             t = TextPath((0, 0), "${xi}$".format(xi=xi), rotation=0, ha='center')
             transform = transOffset
-            if patch.get_extents().height <= t.get_extents().height + 3:
+            if patch.get_extents().height <= t.get_extents().height + 5:
                 va = 'bottom'
                 c = 'k'
                 transform = transOffsetUp
diff --git a/GPy/plotting/matplot_dep/maps.py b/GPy/plotting/matplot_dep/maps.py
index dbedaa98..fcb03b38 100644
--- a/GPy/plotting/matplot_dep/maps.py
+++ b/GPy/plotting/matplot_dep/maps.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 try:
     import pylab as pb
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index ed024c0a..d2d5a8e2 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -172,7 +172,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
         if hasattr(model,"Z"):
             #Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
             Zu = Z[:,free_dims]
-            plots['inducing_inputs'] = ax.plot(Zu[:,free_dims[0]], Zu[:,free_dims[1]], 'wo')
+            plots['inducing_inputs'] = ax.plot(Zu[:,0], Zu[:,1], 'wo')
 
     else:
         raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
diff --git a/GPy/testing/__init__.py b/GPy/testing/__init__.py
index f5a4c54f..2e64d90e 100644
--- a/GPy/testing/__init__.py
+++ b/GPy/testing/__init__.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 
 MaxZ
diff --git a/GPy/testing/index_operations_tests.py b/GPy/testing/index_operations_tests.py
index 738f92b4..e5c2011a 100644
--- a/GPy/testing/index_operations_tests.py
+++ b/GPy/testing/index_operations_tests.py
@@ -1,8 +1,6 @@
-'''
-Created on 12 Feb 2014
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-@author: maxz
-'''
 import unittest
 import numpy as np
 from GPy.core.parameterization.index_operations import ParameterIndexOperations,\
@@ -134,4 +132,4 @@ class Test(unittest.TestCase):
 
 if __name__ == "__main__":
     #import sys;sys.argv = ['', 'Test.test_index_view']
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/GPy/testing/inference_tests.py b/GPy/testing/inference_tests.py
index fd81022a..ac92c519 100644
--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 """
 The test cases for various inference algorithms
@@ -79,4 +81,4 @@ class InferenceXTestCase(unittest.TestCase):
         
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index df64cb78..c1bb9265 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -18,9 +18,9 @@ class Kern_check_model(GPy.core.Model):
     """
     def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
         GPy.core.Model.__init__(self, 'kernel_test_model')
-        np.random.seed()
         if kernel==None:
             kernel = GPy.kern.RBF(1)
+        kernel.randomize(loc=1, scale=0.1)
         if X is None:
             X = np.random.randn(20, kernel.input_dim)
         if dL_dK is None:
diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py
index 9a188de5..95929098 100644
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2014, Alan Saul
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import unittest
 import GPy
diff --git a/GPy/testing/linalg_test.py b/GPy/testing/linalg_test.py
new file mode 100644
index 00000000..8e103795
--- /dev/null
+++ b/GPy/testing/linalg_test.py
@@ -0,0 +1,37 @@
+import numpy as np
+import scipy as sp
+from ..util.linalg import jitchol
+
+class LinalgTests(np.testing.TestCase):
+    def setUp(self):
+        #Create PD matrix
+        A = np.random.randn(20,100)
+        self.A = A.dot(A.T)
+        #compute Eigdecomp
+        vals, vectors = np.linalg.eig(self.A)
+        #Set smallest eigenval to be negative with 5 rounds worth of jitter
+        vals[vals.argmin()] = 0
+        default_jitter = 1e-6*np.mean(vals)
+        vals[vals.argmin()] = -default_jitter*(10**3.5)
+        self.A_corrupt = (vectors * vals).dot(vectors.T)
+
+    def test_jitchol_success(self):
+        """
+        Expect 5 rounds of jitter to be added and for the recovered matrix to be
+        identical to the corrupted matrix apart from the jitter added to the diagonal
+        """
+        L = jitchol(self.A_corrupt, maxtries=5)
+        A_new = L.dot(L.T)
+        diff = A_new - self.A_corrupt
+        np.testing.assert_allclose(diff, np.eye(A_new.shape[0])*np.diag(diff).mean(), atol=1e-13)
+
+    def test_jitchol_failure(self):
+        try:
+            """
+            Expecting an exception to be thrown as we expect it to require
+            5 rounds of jitter to be added to enforce PDness
+            """
+            jitchol(self.A_corrupt, maxtries=4)
+            return False
+        except sp.linalg.LinAlgError:
+            return True
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 521baeb3..559014f7 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -178,6 +178,24 @@ class MiscTests(unittest.TestCase):
         m.optimize()
         print m
 
+    def test_model_updates(self):
+        Y1 = np.random.normal(0, 1, (40, 13))
+        Y2 = np.random.normal(0, 1, (40, 6))
+        m = GPy.models.MRD([Y1, Y2], 5)
+        self.count = 0
+        m.add_observer(self, self._count_updates, -2000)
+        m.update_model(False)
+        m['.*Gaussian'] = .001
+        self.assertEquals(self.count, 0)
+        m['.*Gaussian'].constrain_bounded(0,.01)
+        self.assertEquals(self.count, 0)
+        m.Z.fix()
+        self.assertEquals(self.count, 0)
+        m.update_model(True)
+        self.assertEquals(self.count, 1)
+    def _count_updates(self, me, which):
+        self.count+=1
+
     def test_model_optimize(self):
         X = np.random.uniform(-3., 3., (20, 1))
         Y = np.sin(X) + np.random.randn(20, 1) * 0.05
diff --git a/GPy/testing/mpi_tests.py b/GPy/testing/mpi_tests.py
index 45777eb1..5c489032 100644
--- a/GPy/testing/mpi_tests.py
+++ b/GPy/testing/mpi_tests.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2013-2014, Zhenwen Dai
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import unittest
@@ -89,4 +89,4 @@ if __name__ == "__main__":
         import mpi4py
         unittest.main()
     except:
-        pass
\ No newline at end of file
+        pass
diff --git a/GPy/testing/observable_tests.py b/GPy/testing/observable_tests.py
index d8aad4c7..84059d98 100644
--- a/GPy/testing/observable_tests.py
+++ b/GPy/testing/observable_tests.py
@@ -1,8 +1,5 @@
-'''
-Created on 27 Feb 2014
-
-@author: maxz
-'''
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 import unittest
 from GPy.core.parameterization.parameterized import Parameterized
 from GPy.core.parameterization.param import Param
@@ -132,4 +129,4 @@ class Test(unittest.TestCase):
 
 if __name__ == "__main__":
     #import sys;sys.argv = ['', 'Test.testName']
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py
index d6d6f923..c79e9914 100644
--- a/GPy/testing/pickle_tests.py
+++ b/GPy/testing/pickle_tests.py
@@ -138,8 +138,6 @@ class Test(ListDictTestCase):
         self.assertIsNot(par.gradient_full, pcopy.gradient_full)
         self.assertTrue(pcopy.checkgrad())
         self.assert_(np.any(pcopy.gradient!=0.0))
-        pcopy.optimize('bfgs')
-        par.optimize('bfgs')
         np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
         par.randomize()
         with tempfile.TemporaryFile('w+b') as f:
diff --git a/GPy/util/block_matrices.py b/GPy/util/block_matrices.py
index 8fd5f89d..95920868 100644
--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 
 def get_blocks(A, blocksizes):
diff --git a/GPy/util/caching.py b/GPy/util/caching.py
index 6e954fc7..16adc320 100644
--- a/GPy/util/caching.py
+++ b/GPy/util/caching.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 from ..core.parameterization.observable import Observable
 import collections, weakref
 
diff --git a/GPy/util/choleskies.py b/GPy/util/choleskies.py
new file mode 100644
index 00000000..3f37fc3f
--- /dev/null
+++ b/GPy/util/choleskies.py
@@ -0,0 +1,130 @@
+# Copyright James Hensman and Max Zwiessele 2014
+# Licensed under the GNU GPL version 3.0
+
+import numpy as np
+from scipy import weave
+import linalg
+
+
+def safe_root(N):
+    i = np.sqrt(N)
+    j = int(i)
+    if i != j:
+        raise ValueError, "N is not square!"
+    return j
+
+def flat_to_triang(flat):
+    """take a matrix N x D and return a M X M x D array where
+
+    N = M(M+1)/2
+
+    the lower triangluar portion of the d'th slice of the result is filled by the d'th column of flat.
+    """
+    N, D = flat.shape
+    M = (-1 + safe_root(8*N+1))/2
+    ret = np.zeros((M, M, D))
+    flat = np.ascontiguousarray(flat)
+
+    code = """
+    int count = 0;
+    for(int m=0; m<M; m++)
+    {
+      for(int mm=0; mm<=m; mm++)
+      {
+        for(int d=0; d<D; d++)
+        {
+          ret[d + m*D*M + mm*D] = flat[count];
+          count++;
+        }
+      }
+    }
+    """
+    weave.inline(code, ['flat', 'ret', 'D', 'M'])
+    return ret
+
+def triang_to_flat(L):
+    M, _, D = L.shape
+
+    L = np.ascontiguousarray(L) # should do nothing if L was created by flat_to_triang
+
+    N = M*(M+1)/2
+    flat = np.empty((N, D))
+    code = """
+    int count = 0;
+    for(int m=0; m<M; m++)
+    {
+      for(int mm=0; mm<=m; mm++)
+      {
+        for(int d=0; d<D; d++)
+        {
+          flat[count] = L[d + m*D*M + mm*D];
+          count++;
+        }
+      }
+    }
+    """
+    weave.inline(code, ['flat', 'L', 'D', 'M'])
+    return flat
+
+def triang_to_cov(L):
+    return np.dstack([np.dot(L[:,:,i], L[:,:,i].T) for i in xrange(L.shape[-1])])
+
+def multiple_dpotri_old(Ls):
+    M, _, D = Ls.shape
+    Kis = np.rollaxis(Ls, -1).copy()
+    [dpotri(Kis[i,:,:], overwrite_c=1, lower=1) for i in xrange(D)]
+    code = """
+    for(int d=0; d<D; d++)
+    {
+      for(int m=0; m<M; m++)
+      {
+        for(int mm=0; mm<m; mm++)
+        {
+          Kis[d*M*M + mm*M + m ] = Kis[d*M*M + m*M + mm];
+        }
+      }
+    }
+
+    """
+    weave.inline(code, ['Kis', 'D', 'M'])
+    Kis = np.rollaxis(Kis, 0, 3) #wtf rollaxis?
+    return Kis
+
+def multiple_dpotri(Ls):
+    return np.dstack([linalg.dpotri(np.asfortranarray(Ls[:,:,i]), lower=1)[0] for i in range(Ls.shape[-1])])
+
+
+
+
+def indexes_to_fix_for_low_rank(rank, size):
+    """
+    work out which indexes of the flatteneed array should be fixed if we want the cholesky to represent a low rank matrix
+    """
+    #first we'll work out what to keep, and the do the set difference.
+
+    #here are the indexes of the first column, which are the triangular numbers
+    n = np.arange(size)
+    triangulars = (n**2 + n) / 2
+    keep = []
+    for i in range(rank):
+        keep.append(triangulars[i:] + i)
+    #add the diagonal
+    keep.append(triangulars[1:]-1)
+    keep.append((size**2 + size)/2 -1)# the very last element
+    keep = np.hstack(keep)
+
+    return np.setdiff1d(np.arange((size**2+size)/2), keep)
+
+
+
+#class cholchecker(GPy.core.Model):
+    #def __init__(self, L, name='cholchecker'):
+        #super(cholchecker, self).__init__(name)
+        #self.L = GPy.core.Param('L',L)
+        #self.link_parameter(self.L)
+    #def parameters_changed(self):
+        #LL = flat_to_triang(self.L)
+        #Ki = multiple_dpotri(LL)
+        #self.L.gradient = 2*np.einsum('ijk,jlk->ilk', Ki, LL)
+        #self._loglik = np.sum([np.sum(np.log(np.abs(np.diag()))) for i in range(self.L.shape[-1])])
+#
diff --git a/GPy/util/classification.py b/GPy/util/classification.py
index 41701949..c0859793 100644
--- a/GPy/util/classification.py
+++ b/GPy/util/classification.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 
 def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
diff --git a/GPy/util/debug.py b/GPy/util/debug.py
index b676d028..00107f5e 100644
--- a/GPy/util/debug.py
+++ b/GPy/util/debug.py
@@ -10,7 +10,7 @@ import numpy as np
 def checkFinite(arr, name=None):
     if name is None:
         name = 'Array with ID['+str(id(arr))+']'
-        
+
     if np.any(np.logical_not(np.isfinite(arr))):
         idx = np.where(np.logical_not(np.isfinite(arr)))[0]
         print name+' at indices '+str(idx)+' have not finite values: '+str(arr[idx])+'!'
@@ -21,16 +21,15 @@ def checkFullRank(m, tol=1e-10, name=None, force_check=False):
     if name is None:
         name = 'Matrix with ID['+str(id(m))+']'
     assert len(m.shape)==2 and m.shape[0]==m.shape[1], 'The input of checkFullRank has to be a square matrix!'
-    
+
     if not force_check and m.shape[0]>=10000:
         print 'The size of '+name+'is too big to check (>=10000)!'
         return True
-    
+
     s = np.real(np.linalg.eigvals(m))
-    
+
     if s.min()/s.max()<tol:
         print name+' is close to singlar!'
         print 'The eigen values of '+name+' is '+str(s)
         return False
     return True
-    
\ No newline at end of file
diff --git a/GPy/util/decorators.py b/GPy/util/decorators.py
index c8aa08a2..4273442d 100644
--- a/GPy/util/decorators.py
+++ b/GPy/util/decorators.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from functools import wraps
 
diff --git a/GPy/util/diag.py b/GPy/util/diag.py
index 3044ed54..e7c332e2 100644
--- a/GPy/util/diag.py
+++ b/GPy/util/diag.py
@@ -1,10 +1,5 @@
-'''
-.. module:: GPy.util.diag
-
-.. moduleauthor:: Max Zwiessele <ibinbei@gmail.com>
-
-'''
-__updated__ = '2013-12-03'
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
 
diff --git a/GPy/util/functions.py b/GPy/util/functions.py
index 3278182f..be024aeb 100644
--- a/GPy/util/functions.py
+++ b/GPy/util/functions.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy.special import erf, erfc, erfcx
 import sys
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index dffd438a..b148f2f4 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -82,6 +82,7 @@ def force_F_ordered(A):
 
 #         return jitchol(A+np.eye(A.shape[0])*jitter, maxtries-1)
 
+
 def jitchol(A, maxtries=5):
     A = np.ascontiguousarray(A)
     L, info = lapack.dpotrf(A, lower=1)
@@ -92,25 +93,19 @@ def jitchol(A, maxtries=5):
         if np.any(diagA <= 0.):
             raise linalg.LinAlgError, "not pd: non-positive diagonal elements"
         jitter = diagA.mean() * 1e-6
-        while maxtries > 0 and np.isfinite(jitter):
+        num_tries = 1
+        while num_tries <= maxtries and np.isfinite(jitter):
             try:
                 L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
+                logging.warning('Added {} rounds of jitter, jitter of {:.10e}\n'.format(num_tries, jitter))
+                return L
             except:
                 jitter *= 10
-            finally:
-                maxtries -= 1
-        raise linalg.LinAlgError, "not positive definite, even with jitter."
+                num_tries += 1
     import traceback
-    try: raise
-    except:
-        logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
-            '  in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
-    import ipdb;ipdb.set_trace()
-    return L
-
-
-
-
+    logging.warning('\n'.join(['Added {} rounds of jitter, jitter of {:.10e}'.format(num_tries-1, jitter),
+                                '  in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
+    raise linalg.LinAlgError, "not positive definite, even with jitter."
 
 # def dtrtri(L, lower=1):
 #     """
diff --git a/README.md b/README.md
index 6a880209..5e98af85 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,17 @@
-GPy
-===
+# GPy
+
 
 A Gaussian processes framework in Python.
 
 * [GPy homepage](http://sheffieldml.github.io/GPy/)
+* [Tutorial notebooks](http://nbviewer.ipython.org/github/SheffieldML/notebook/blob/master/GPy/index.ipynb)
 * [User mailing list](https://lists.shef.ac.uk/sympa/subscribe/gpy-users)
 * [Online documentation](https://gpy.readthedocs.org/en/latest/)
 * [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)
 
 Continuous integration status: ![CI status](https://travis-ci.org/SheffieldML/GPy.png)
 
-Citation
-========
+### Citation
 
     @Misc{gpy2014,
       author =   {The GPy authors},
@@ -20,23 +20,29 @@ Citation
       year = {2012--2014}
     }
 
-Pronounciation
-==============
+### Pronounciation
+
 We like to pronounce it 'Gee-pie'.
 
-Getting started
-===============
-Installing with pip
--------------------
-The simplest way to install GPy is using pip. ubuntu users can do:
+### Getting started: installing with pip
+
+The simplest way to install GPy is using pip. Ubuntu users can do:
 
     sudo apt-get install python-pip
     pip install gpy
+    
+On windows, we recommend the ![anaconda python distribution](http://continuum.io/downloads). We've also had luck with ![enthought](http://www.enthought.com).
+
+On a fresh install of windows 8.1, we downloaded the Anaconda python distribution, started the anaconda command prompt and typed 
+
+    pip install GPy
+
+Everything seems to work: from here you can type `ipython` and then `import GPy; GPy.tests()`. Working as of 21/11/14
 
 If you'd like to install from source, or want to contribute to the project (e.g. by sending pull requests via github), read on.
 
-Ubuntu
-------
+### Ubuntu hackers
+
 For the most part, the developers are using ubuntu. To install the required packages:
 
     sudo apt-get install python-numpy python-scipy python-matplotlib
@@ -47,32 +53,25 @@ clone this git repository and add it to your path:
     echo 'PYTHONPATH=$PYTHONPATH:~/SheffieldML' >> ~/.bashrc
 
 
-Windows
--------
-On windows, we recommend the ![anaconda python distribution](http://continuum.io/downloads). We've also had luck with ![enthought](http://www.enthought.com). git clone or unzip the source to a suitable directory, and add an approptiate PYTHONPATH environment variable. 
+ 
+### OSX
 
-On windows 7 (and possibly earlier versions) there's a bug in scipy version 0.13 which tries to write very long filenames. Reverting to scipy 0.12 seems to do the trick:
-
-    conda install scipy=0.12
-
-OSX
----
 Everything appears to work out-of-the box using ![enthought](http://www.enthought.com) on osx Mavericks. Download/clone GPy, and then add GPy to your PYTHONPATH
 
     git clone git@github.com:SheffieldML/GPy.git ~/SheffieldML
     echo 'PYTHONPATH=$PYTHONPATH:~/SheffieldML' >> ~/.profile
 
 
-Compiling documentation:
-========================
+### Compiling documentation:
+
 
 The documentation is stored in doc/ and is compiled with the Sphinx Python documentation generator, and is written in the reStructuredText format.
 
 The Sphinx documentation is available here: http://sphinx-doc.org/latest/contents.html
 
 
-Installing dependencies:
-------------------------
+##### Installing dependencies:
+
 
 To compile the documentation, first ensure that Sphinx is installed. On Debian-based systems, this can be achieved as follows:
 
@@ -86,8 +85,8 @@ A LaTeX distribution is also required to compile the equations. Note that the ex
     sudo apt-get install ipython
 
 
-Compiling documentation:
-------------------------
+#### Compiling documentation:
+
 
 The documentation can be compiled as follows:
 
@@ -97,8 +96,8 @@ The documentation can be compiled as follows:
 The HTML files are then stored in doc/_build/
 
 
-Running unit tests:
-===================
+## Running unit tests:
+
 
 Ensure nose is installed via pip:
 
@@ -108,8 +107,14 @@ Run nosetests from the root directory of the repository:
 
     nosetests -v
 
-Funding Acknowledgements
-========================
+or from within IPython
+
+    import GPy; GPy.tests()
+
+
+
+## Funding Acknowledgements
+
 
 Current support for the GPy software is coming through the following projects. 
 
diff --git a/setup.py b/setup.py
index c4963bcc..0562c9d8 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ import os
 from setuptools import setup
 
 # Version number
-version = '0.6.0'
+version = '0.6.1'
 
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()