[huge merge] the second

2026-06-08 15:05:15 +02:00 · 2014-11-21 16:42:01 +00:00 · 2014-11-21 16:42:01 +00:00 · 187f85c239
commit 187f85c239
parent 180650ec85
35 changed files with 40 additions and 3018 deletions
--- a/.gitignore
+++ b/.gitignore
@ -45,4 +45,4 @@ iterate.dat

 # git merge files #
 ###################
-*.orig
+*.orig
--- a/.travis.yml
+++ b/.travis.yml
@ -2,14 +2,14 @@ language: python
 python:
  - "2.7"

-#Set virtual env with system-site-packages to true
-virtualenv:
-  system_site_packages: true
-
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 before_install:
-  - sudo apt-get install -qq python-scipy python-pip
-  - sudo apt-get install -qq python-matplotlib
+  #Install a mini version of anaconda such that we can easily install our dependencies
+  - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
+  - chmod +x miniconda.sh
+  - ./miniconda.sh -b
+  - export PATH=/home/travis/miniconda/bin:$PATH
+  - conda update --yes conda
  # Workaround for a permissions issue with Travis virtual machine images
  # that breaks Python's multiprocessing:
  # https://github.com/travis-ci/travis-cookbooks/issues/155
@ -17,11 +17,10 @@ before_install:
  - sudo ln -s /run/shm /dev/shm

 install:
-  - pip install --upgrade numpy==1.7.1
-  - pip install sphinx
-  - pip install nose
-  - pip install . --use-mirrors
+  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.7 scipy=0.12 matplotlib nose sphinx pip nose
+  - pip install . 
+  #--use-mirrors
+  #
 # command to run tests, e.g. python setup.py test
-script:
+script: 
  - nosetests GPy/testing
-  #- yes | nosetests GPy/testing
--- a/GPy/gpy_config.cfg
+++ b/GPy/gpy_config.cfg
@ -1,7 +0,0 @@
-# This is the configuration file for GPy
-
-[parallel]
-# Enable openmp support. This speeds up some computations, depending on the number
-# of cores available. Setting up a compiler with openmp support can be difficult on 
-# some platforms, hence this option.
-openmp=False
--- a/GPy/inference/optimization/init.py
+++ b/GPy/inference/optimization/init.py
@ -1 +1,2 @@
+from scg import SCG
 from optimization import *
--- a/GPy/inference/optimization/optimization.py
+++ b/GPy/inference/optimization/optimization.py
@ -225,11 +225,13 @@ class opt_SCG(Optimizer):
        self.status = opt_result[3]

 def get_optimizer(f_min):
+    from sgd import opt_SGD
+
    optimizers = {'fmin_tnc': opt_tnc,
          'simplex': opt_simplex,
          'lbfgsb': opt_lbfgsb,
          'scg': opt_SCG,
-          }
+          'sgd': opt_SGD}

    if rasm_available:
        optimizers['rasmussen'] = opt_rasm
--- a/GPy/kern/_src/sympy_helpers.cpp
+++ b/GPy/kern/_src/sympy_helpers.cpp
@ -1,9 +1,7 @@
-#include "Python.h"
 #include <math.h>
 #include <float.h>
 #include <stdlib.h>
-#include <iostream>
-#include <stdexcept>
+
 double DiracDelta(double x){
  // TODO: this doesn't seem to be a dirac delta ... should return infinity. Neil
    if((x<0.000001) & (x>-0.000001))//go on, laugh at my c++ skills
@ -16,7 +14,6 @@ double DiracDelta(double x,int foo){
 };

 double sinc(double x){
-  // compute the sinc function
  if (x==0)
    return 1.0;
  else 
@ -24,39 +21,28 @@ double sinc(double x){
 }

 double sinc_grad(double x){
-  // compute the gradient of the sinc function.
  if (x==0)
    return 0.0;
  else 
    return (x*cos(x) - sin(x))/(x*x);
 }
+
 double erfcx(double x){
-  // Based on code by Soren Hauberg 2010 for Octave.
-  // compute the scaled complex error function.
-  //return erfc(x)*exp(x*x);
  double xneg=-sqrt(log(DBL_MAX/2));
  double xmax = 1/(sqrt(M_PI)*DBL_MIN);
  xmax = DBL_MAX<xmax ? DBL_MAX : xmax;
  // Find values where erfcx can be evaluated
-  double t = 3.97886080735226 / (fabs(x) + 3.97886080735226);
+  double t = 3.97886080735226 / (abs(x) + 3.97886080735226);
  double u = t-0.5;
  double y = (((((((((u * 0.00127109764952614092 + 1.19314022838340944e-4) * u 
-		     - 0.003963850973605135)   * u - 8.70779635317295828e-4) * u 
-		   + 0.00773672528313526668) * u + 0.00383335126264887303) * u 
-		 - 0.0127223813782122755)  * u - 0.0133823644533460069)  * u 
-	       + 0.0161315329733252248)  * u + 0.0390976845588484035)  * u + 0.00249367200053503304;
-  y = ((((((((((((y * u - 0.0838864557023001992) * u -		       
-		 0.119463959964325415) * u + 0.0166207924969367356) * u + 
-	       0.357524274449531043) * u + 0.805276408752910567)  * u + 
-	     1.18902982909273333)  * u + 1.37040217682338167)   * u +	
-	   1.31314653831023098)  * u + 1.07925515155856677)   * u +	
-	 0.774368199119538609) * u + 0.490165080585318424)  * u +	
-       0.275374741597376782) * t;
-
+	      - 0.003963850973605135)   * u - 8.70779635317295828e-4) * u 
+	    + 0.00773672528313526668) * u + 0.00383335126264887303) * u 
+	  - 0.0127223813782122755)  * u - 0.0133823644533460069)  * u 
+	+ 0.0161315329733252248)  * u + 0.0390976845588484035)  * u + 0.00249367200053503304;
  if (x<xneg)
    return -INFINITY;
  else if (x<0)
-    return 2.0*exp(x*x)-y;
+    return 2*exp(x*x)-y;
  else if (x>xmax)
    return 0.0;
  else 
@ -64,133 +50,12 @@ double erfcx(double x){
 }

 double ln_diff_erf(double x0, double x1){
-  // stably compute the log of difference between two erfs.
-  if (x1>x0){
-    PyErr_SetString(PyExc_RuntimeError,"second argument must be smaller than or equal to first in ln_diff_erf");
-    throw 1;
-  }
-  if (x0==x1){
-    PyErr_WarnEx(PyExc_RuntimeWarning,"divide by zero encountered in log", 1);
-    return -INFINITY;
-  }
-  else if(x0<0 && x1>0 || x0>0 && x1<0) //x0 and x1 have opposite signs
+  if (x0==x1)
+    return INFINITY;
+  else if(x0<0 && x1>0 || x0>0 && x1<0)
    return log(erf(x0)-erf(x1));
-  else if(x0>0) //x0 positive, x1 non-negative
-    return log(erfcx(x1)-erfcx(x0)*exp(x1*x1- x0*x0))-x1*x1; 
-  else //x0 and x1 non-positive
+  else if(x1>0)
+    return log(erfcx(x1)-erfcx(x0)*exp(x1*x1)- x0*x0)-x1*x1;
+  else 
    return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
 }
-// TODO: For all these computations of h things are very efficient at the moment. Need to recode sympykern to allow the precomputations to take place and all the gradients to be computed in one function. Not sure of best way forward for that yet. Neil
-double h(double t, double tprime, double d_i, double d_j, double l){
-  // Compute the h function for the sim covariance.
-  double half_l_di = 0.5*l*d_i;
-  double arg_1 = half_l_di + tprime/l;
-  double arg_2 = half_l_di - (t-tprime)/l;
-  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
-  arg_2 = half_l_di - t/l;
-  double sign_val = 1.0;
-  if(t/l==0)
-    sign_val = 0.0;
-  else if (t/l < 0)
-    sign_val = -1.0;
-  arg_2 = half_l_di - t/l;
-  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
-  // if either ln_part_1 or ln_part_2 are -inf, don't bother computing rest of that term.
-  double part_1 = 0.0;
-  if(isfinite(ln_part_1))
-    part_1 = sign_val*exp(half_l_di*half_l_di - d_i*(t-tprime) + ln_part_1 - log(d_i + d_j));
-  double part_2 = 0.0;
-  if(isfinite(ln_part_2))
-    part_2 = sign_val*exp(half_l_di*half_l_di - d_i*t - d_j*tprime + ln_part_2 - log(d_i + d_j));
-  return part_1 - part_2;
-}
-
-
-double dh_dd_i(double t, double tprime, double d_i, double d_j, double l){
-  double diff_t = (t-tprime);
-  double l2 = l*l;
-  double hv = h(t, tprime, d_i, d_j, l);
-  double half_l_di = 0.5*l*d_i;
-  double arg_1 = half_l_di + tprime/l;
-  double arg_2 = half_l_di - (t-tprime)/l;
-  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
-  arg_1 = half_l_di;
-  arg_2 = half_l_di - t/l;
-  double sign_val = 1.0;
-  if(t/l==0)
-    sign_val = 0.0;
-  else if (t/l < 0)
-    sign_val = -1.0;
-  double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
-  double base = (0.5*d_i*l2*(d_i+d_j)-1)*hv;
-  if(isfinite(ln_part_1))
-    base -= diff_t*sign_val*exp(half_l_di*half_l_di
-				-d_i*diff_t
-				+ln_part_1);
-  if(isfinite(ln_part_2))
-    base += t*sign_val*exp(half_l_di*half_l_di
-			   -d_i*t-d_j*tprime
-			   +ln_part_2);
-  base += l/sqrt(M_PI)*(-exp(-diff_t*diff_t/l2)
-			+exp(-tprime*tprime/l2-d_i*t)
-			+exp(-t*t/l2-d_j*tprime)
-			-exp(-(d_i*t + d_j*tprime)));
-  return base/(d_i+d_j);
-
-}
-
-double dh_dd_j(double t, double tprime, double d_i, double d_j, double l){
-  double half_l_di = 0.5*l*d_i;
-  double hv = h(t, tprime, d_i, d_j, l);
-  double sign_val = 1.0;
-  if(t/l==0)
-    sign_val = 0.0;
-  else if (t/l < 0)
-    sign_val = -1.0;
-  double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
-  double base = -hv;
-  if(isfinite(ln_part_2))
-    base += tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2);
-  return base/(d_i+d_j);
-}
-
-double dh_dl(double t, double tprime, double d_i, double d_j, double l){
-  // compute gradient of h function with respect to lengthscale for sim covariance
-  // TODO a lot of energy wasted recomputing things here, need to do this in a shared way somehow ... perhaps needs rewrite of sympykern.
-  double half_l_di = 0.5*l*d_i;
-  double arg_1 = half_l_di + tprime/l;
-  double arg_2 = half_l_di - (t-tprime)/l;
-  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
-  arg_2 = half_l_di - t/l;
-  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
-  double diff_t = t - tprime;
-  double l2 = l*l;
-  double hv = h(t, tprime, d_i, d_j, l);
-  return 0.5*d_i*d_i*l*hv + 2/(sqrt(M_PI)*(d_i+d_j))*((-diff_t/l2-d_i/2)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2)*exp(-t*t/l2-d_j*tprime)-d_i/2*exp(-(d_i*t+d_j*tprime)));
-}
-
-double dh_dt(double t, double tprime, double d_i, double d_j, double l){
-  // compute gradient of h function with respect to t.
-  double diff_t = t - tprime;
-  double half_l_di = 0.5*l*d_i;
-  double arg_1 = half_l_di + tprime/l;
-  double arg_2 = half_l_di - diff_t/l;
-  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
-  arg_2 = half_l_di - t/l;
-  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
-  
-  return (d_i*exp(ln_part_2-d_i*t - d_j*tprime) - d_i*exp(ln_part_1-d_i*diff_t) + 2*exp(-d_i*diff_t - pow(half_l_di - diff_t/l, 2))/(sqrt(M_PI)*l) - 2*exp(-d_i*t - d_j*tprime - pow(half_l_di - t/l,2))/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
-}
-
-double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
-  // compute gradient of h function with respect to tprime.
-  double diff_t = t - tprime;
-  double half_l_di = 0.5*l*d_i;
-  double arg_1 = half_l_di + tprime/l;
-  double arg_2 = half_l_di - diff_t/l;
-  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
-  arg_2 = half_l_di - t/l;
-  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
-
-  return (d_i*exp(ln_part_1-d_i*diff_t) + d_j*exp(ln_part_2-d_i*t - d_j*tprime) + (-2*exp(-pow(half_l_di - diff_t/l,2)) + 2*exp(-pow(half_l_di + tprime/l,2)))*exp(-d_i*diff_t)/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
-}
--- a/GPy/kern/_src/sympy_helpers.h
+++ b/GPy/kern/_src/sympy_helpers.h
@ -7,10 +7,3 @@ double sinc_grad(double x);

 double erfcx(double x);
 double ln_diff_erf(double x0, double x1);
-
-double h(double t, double tprime, double d_i, double d_j, double l);
-double dh_dl(double t, double tprime, double d_i, double d_j, double l);
-double dh_dd_i(double t, double tprime, double d_i, double d_j, double l);
-double dh_dd_j(double t, double tprime, double d_i, double d_j, double l);
-double dh_dt(double t, double tprime, double d_i, double d_j, double l);
-double dh_dtprime(double t, double tprime, double d_i, double d_j, double l);
--- a/GPy/kern/_src/todo/ODE_1.py
+++ b/GPy/kern/_src/todo/ODE_1.py
@ -137,11 +137,7 @@ class ODE_1(Kernpart):
        k2 = (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2 
        k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
        dkdvar = k1+k2+k3
-        
-        #target[0] dk dvarU
-        #target[1] dk dvarY
-        #target[2] dk d theta1
-        #target[3] dk d theta2 
+
        target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
        target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
        target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)
--- a/GPy/kern/_src/todo/hetero.py
+++ b/GPy/kern/_src/todo/hetero.py
@ -1,6 +1,7 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

+from IPython.core.debugger import Tracer; debug_here=Tracer()
 from kernpart import Kernpart
 import numpy as np
 from ...util.linalg import tdot
--- a/GPy/kern/_src/todo/symmetric.py
+++ b/GPy/kern/_src/todo/symmetric.py
@ -45,7 +45,7 @@ class Symmetric(Kernpart):
        AX = np.dot(X,self.transform)
        if X2 is None:
            X2 = X
-            AX2 = AX
+            ZX2 = AX
        else:
            AX2 = np.dot(X2, self.transform)
        self.k._param_grad_helper(dL_dK,X,X2,target)
--- a/GPy/kern/parts/ODE_UY.py
+++ b/GPy/kern/parts/ODE_UY.py
@ -1,335 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-from kernpart import Kernpart
-import numpy as np
-
-def index_to_slices(index):
-    """
-    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.
-
-    e.g.
-    >>> index = np.asarray([0,0,0,1,1,1,2,2,2])
-    returns
-    >>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
-
-    or, a more complicated example
-    >>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
-    returns
-    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
-    """
-
-    #contruct the return structure
-    ind = np.asarray(index,dtype=np.int64)
-    ret = [[] for i in range(ind.max()+1)]
-
-    #find the switchpoints
-    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
-    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
-
-    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
-    return ret
-
-class ODE_UY(Kernpart):
-    """
-    kernel resultiong from a first order ODE with OU driving GP
-
-    :param input_dim: the number of input dimension, has to be equal to one
-    :type input_dim: int
-    :param input_lengthU: the number of input U length
-    :type input_dim: int
-    :param varianceU: variance of the driving GP
-    :type varianceU: float
-    :param lengthscaleU: lengthscale of the driving GP  (sqrt(3)/lengthscaleU)
-    :type lengthscaleU: float
-    :param varianceY: 'variance' of the transfer function
-    :type varianceY: float
-    :param lengthscaleY: 'lengthscale' of the transfer function (1/lengthscaleY)
-    :type lengthscaleY: float
-    :rtype: kernel object
-
-    """
-
-
-
-
-    def __init__(self, input_dim=2,varianceU=1., varianceY=1., lengthscaleU=None, lengthscaleY=None):
-        assert input_dim==2, "Only defined for input_dim = 1"
-        self.input_dim = input_dim
-        self.num_params = 4
-        self.name = 'ODE_UY'
-
-
-        if lengthscaleU is not None:
-            lengthscaleU = np.asarray(lengthscaleU)
-            assert lengthscaleU.size == 1, "lengthscaleU should be one dimensional"
-        else:
-            lengthscaleU = np.ones(1)
-        if lengthscaleY is not None:
-            lengthscaleY = np.asarray(lengthscaleY)
-            assert lengthscaleY.size == 1, "lengthscaleY should be one dimensional"
-        else:
-            lengthscaleY = np.ones(1)
-            #lengthscaleY = 0.5
-        self._set_params(np.hstack((varianceU, varianceY, lengthscaleU,lengthscaleY)))
-
-    def _get_params(self):
-        """return the value of the parameters."""
-        return np.hstack((self.varianceU,self.varianceY, self.lengthscaleU,self.lengthscaleY))
-
-    def _set_params(self, x):
-        """set the value of the parameters."""
-        assert x.size == self.num_params
-
-        self.varianceU = x[0]
-        self.varianceY = x[1]
-        self.lengthscaleU = x[2]
-        self.lengthscaleY = x[3]
-
-
-    def _get_param_names(self):
-        """return parameter names."""
-        return ['varianceU','varianceY', 'lengthscaleU', 'lengthscaleY']
-
-
-    def K(self, X, X2, target):
-        """Compute the covariance matrix between X and X2."""
-        # model :   a * dy/dt + b * y = U
-        #lu=sqrt(3)/theta1  ly=1/theta2  theta2= a/b :thetay   sigma2=1/(2ab) :sigmay
-
-        X,slices = X[:,:-1],index_to_slices(X[:,-1])
-        if X2 is None:
-            X2,slices2 = X,slices
-        else:
-            X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
-
-
-        #rdist = X[:,0][:,None] - X2[:,0][:,None].T
-        rdist = X - X2.T
-        ly=1/self.lengthscaleY
-        lu=np.sqrt(3)/self.lengthscaleU
-        #iu=self.input_lengthU  #dimention of U
-
-        Vu=self.varianceU
-        Vy=self.varianceY
-
-        # kernel for kuu  matern3/2
-        kuu = lambda dist:Vu * (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
-
-        # kernel for kyy
-        k1 = lambda dist:np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
-        k2 = lambda dist:(np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
-        k3 = lambda dist:np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
-        kyy = lambda dist:Vu*Vy*(k1(dist) + k2(dist) + k3(dist))
-
-
-        # cross covariance function
-        kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
-
-        # cross covariance kyu
-        kyup = lambda dist:Vu*Vy*(k1(dist)+k2(dist))    #t>0 kyu
-        kyun = lambda dist:Vu*Vy*(kyu3(dist))       #t<0 kyu
-
-        # cross covariance kuy
-        kuyp = lambda dist:Vu*Vy*(kyu3(dist))       #t>0 kuy
-        kuyn = lambda dist:Vu*Vy*(k1(dist)+k2(dist))      #t<0 kuy
-
-        for i, s1 in enumerate(slices):
-            for j, s2 in enumerate(slices2):
-                for ss1 in s1:
-                    for ss2 in s2:
-                        if i==0 and j==0:
-                            target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
-                        elif i==0 and j==1:
-                            #target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) )   )
-                            target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[ss1,ss2]) )   )
-                        elif i==1 and j==1:
-                            target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
-                        else:
-                            #target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]]) )   )
-                            target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[ss1,ss2]) )   )
-
-        #KUU = kuu(np.abs(rdist[:iu,:iu]))
-
-        #KYY = kyy(np.abs(rdist[iu:,iu:]))
-
-        #KYU = np.where(rdist[iu:,:iu]>0,kyup(np.abs(rdist[iu:,:iu])),kyun(np.abs(rdist[iu:,:iu]) ))
-
-        #KUY = np.where(rdist[:iu,iu:]>0,kuyp(np.abs(rdist[:iu,iu:])),kuyn(np.abs(rdist[:iu,iu:]) ))
-
-        #ker=np.vstack((np.hstack([KUU,KUY]),np.hstack([KYU,KYY])))
-
-        #np.add(ker, target, target)
-
-    def Kdiag(self, X, target):
-        """Compute the diagonal of the covariance matrix associated to X."""
-        ly=1/self.lengthscaleY
-        lu=np.sqrt(3)/self.lengthscaleU
-        #ly=self.lengthscaleY
-        #lu=self.lengthscaleU
-
-        k1 = (2*lu+ly)/(lu+ly)**2
-        k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
-        k3 = 1/(lu+ly) + (lu)/(lu+ly)**2
-
-        slices = index_to_slices(X[:,-1])
-
-        for i, ss1 in enumerate(slices):
-            for s1 in ss1:
-                if i==0:
-                    target[s1]+= self.varianceU
-                elif i==1:
-                    target[s1]+= self.varianceU*self.varianceY*(k1+k2+k3)
-                else:
-                    raise ValueError, "invalid input/output index"
-
-        #target[slices[0][0]]+= self.varianceU   #matern32 diag
-        #target[slices[1][0]]+= self.varianceU*self.varianceY*(k1+k2+k3)  #  diag
-
-
-
-
-
-
-    def dK_dtheta(self, dL_dK, X, X2, target):
-        """derivative of the covariance matrix with respect to the parameters."""
-
-        X,slices = X[:,:-1],index_to_slices(X[:,-1])
-        if X2 is None:
-            X2,slices2 = X,slices
-        else:
-            X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
-        #rdist = X[:,0][:,None] - X2[:,0][:,None].T
-        rdist = X - X2.T
-        ly=1/self.lengthscaleY
-        lu=np.sqrt(3)/self.lengthscaleU
-
-        rd=rdist.shape[0]
-        dktheta1 = np.zeros([rd,rd])
-        dktheta2 = np.zeros([rd,rd])
-        dkUdvar = np.zeros([rd,rd])
-        dkYdvar = np.zeros([rd,rd])
-
-        # dk dtheta for UU
-        UUdtheta1 = lambda dist: np.exp(-lu* dist)*dist + (-dist)*np.exp(-lu* dist)*(1+lu*dist)
-        UUdtheta2 = lambda dist: 0
-        #UUdvar = lambda dist: (1 + lu*dist)*np.exp(-lu*dist)
-        UUdvar = lambda dist: (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
-
-        # dk dtheta for YY
-
-        dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
-        #c=np.sqrt(3)
-        #t1=c/lu
-        #t2=1/ly
-        #dk1theta1=np.exp(-dist*ly)*t2*( (2*c*t2+2*t1)/(c*t2+t1)**2 -2*(2*c*t2*t1+t1**2)/(c*t2+t1)**3   )
-
-        dk2theta1 = lambda dist: 1*(
-            np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
-            +np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
-            +np.exp(-dist*ly)*2*(ly-lu)**(-2)
-            +np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
-            )
-
-        dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
-
-        #dktheta1 = lambda dist: self.varianceU*self.varianceY*(dk1theta1+dk2theta1+dk3theta1)
-
-
-
-
-        dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * (  (-dist)*(2*lu+ly)  +  1  +  (-2)*(2*lu+ly)/(lu+ly)  )
-
-        dk2theta2 =lambda dist:  1*(
-            np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
-            +np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
-            )
-
-        dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
-
-        #dktheta2 = lambda dist: self.varianceU*self.varianceY*(dk1theta2 + dk2theta2 +dk3theta2)
-
-        # kyy kernel
-        #k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
-        #k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
-        #k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
-        k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
-        k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
-        k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
-        #dkdvar = k1+k2+k3
-
-        #cross covariance kernel
-        kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
-
-        # dk dtheta for UY
-        dkcrtheta2 = lambda dist: np.exp(-lu*dist) * ( (-1)*(lu+ly)**(-2)*(1+lu*dist+lu*(lu+ly)**(-1)) + (lu+ly)**(-1)*(-lu)*(lu+ly)**(-2) )
-        dkcrtheta1 = lambda dist: np.exp(-lu*dist)*(lu+ly)**(-1)* ( (-dist)*(1+dist*lu+lu*(lu+ly)**(-1)) - (lu+ly)**(-1)*(1+dist*lu+lu*(lu+ly)**(-1)) +dist+(lu+ly)**(-1)-lu*(lu+ly)**(-2) )
-        #dkuyp dtheta
-        #dkuyp dtheta1 = self.varianceU*self.varianceY* (dk1theta1() + dk2theta1())
-        #dkuyp dtheta2 = self.varianceU*self.varianceY* (dk1theta2() + dk2theta2())
-        #dkuyp dVar = k1() + k2()
-
-
-        #dkyup dtheta
-        #dkyun dtheta1 = self.varianceU*self.varianceY* (dk1theta1() + dk2theta1())
-        #dkyun dtheta2 = self.varianceU*self.varianceY* (dk1theta2() + dk2theta2())
-        #dkyup dVar = k1() + k2()        #
-
-
-
-
-        for i, s1 in enumerate(slices):
-            for j, s2 in enumerate(slices2):
-                for ss1 in s1:
-                    for ss2 in s2:
-                        if i==0 and j==0:
-                            #target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
-                            dktheta1[ss1,ss2] = self.varianceU*self.varianceY*UUdtheta1(np.abs(rdist[ss1,ss2]))
-                            dktheta2[ss1,ss2] = 0
-                            dkUdvar[ss1,ss2] = UUdvar(np.abs(rdist[ss1,ss2]))
-                            dkYdvar[ss1,ss2] = 0
-                        elif i==0 and j==1:
-                            #target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) )   )
-                            #dktheta1[ss1,ss2] =
-                            #dktheta2[ss1,ss2] =
-                            #dkdvar[ss1,ss2] =           np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) )   )
-                            dktheta1[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , dkcrtheta1(np.abs(rdist[ss1,ss2])) ,self.varianceU*self.varianceY*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2])))    )
-                            dktheta2[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , dkcrtheta2(np.abs(rdist[ss1,ss2])) ,self.varianceU*self.varianceY*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2])))    )
-                            dkUdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 ,  kyu3(np.abs(rdist[ss1,ss2]))  ,k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))  )
-                            dkYdvar[ss1,ss2] = dkUdvar[ss1,ss2]
-                        elif i==1 and j==1:
-                            #target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
-                            dktheta1[ss1,ss2] = self.varianceU*self.varianceY*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))+dk3theta1(np.abs(rdist[ss1,ss2])))
-                            dktheta2[ss1,ss2] = self.varianceU*self.varianceY*(dk1theta2(np.abs(rdist[ss1,ss2])) + dk2theta2(np.abs(rdist[ss1,ss2])) +dk3theta2(np.abs(rdist[ss1,ss2])))
-                            dkUdvar[ss1,ss2] = (k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
-                            dkYdvar[ss1,ss2] = dkUdvar[ss1,ss2]
-                        else:
-                            #target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]]) )   )
-                            dktheta1[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 ,self.varianceU*self.varianceY*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) , dkcrtheta1(np.abs(rdist[ss1,ss2])) )
-                            dktheta2[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 ,self.varianceU*self.varianceY*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) , dkcrtheta2(np.abs(rdist[ss1,ss2])) )
-                            dkUdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2])), kyu3(np.abs(rdist[ss1,ss2])) )
-                            dkYdvar[ss1,ss2] = dkUdvar[ss1,ss2]
-
-
-        target[0] += np.sum(self.varianceY*dkUdvar * dL_dK)
-        target[1] += np.sum(self.varianceU*dkYdvar * dL_dK)
-        target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)
-        target[3] += np.sum(dktheta2*(-self.lengthscaleY**(-2)) * dL_dK)
-
-
-    # def dKdiag_dtheta(self, dL_dKdiag, X, target):
-    #     """derivative of the diagonal of the covariance matrix with respect to the parameters."""
-    #     # NB: derivative of diagonal elements wrt lengthscale is 0
-    #     target[0] += np.sum(dL_dKdiag)
-
-    # def dK_dX(self, dL_dK, X, X2, target):
-    #     """derivative of the covariance matrix with respect to X."""
-    #     if X2 is None: X2 = X
-    #     dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
-    #     ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
-    #     dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
-    #     target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
-
-    # def dKdiag_dX(self, dL_dKdiag, X, target):
-    #     pass
--- a/GPy/kern/parts/sympy_helpers.py
+++ b/GPy/kern/parts/sympy_helpers.py
@ -1,71 +0,0 @@
-# Code for testing functions written in sympy_helpers.cpp
-from scipy import weave
-import tempfile
-import os
-import numpy as np
-current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
-extra_compile_args = []
-
-weave_kwargs = {
-    'support_code': "",
-    'include_dirs':[tempfile.gettempdir(), current_dir],
-    'headers':['"parts/sympy_helpers.h"'],
-    'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")],
-    'extra_compile_args':extra_compile_args,
-    'extra_link_args':['-lgomp'],
-    'verbose':True}
-
-def erfcx(x):
-    code = """
-        // Code for computing scaled complementary erf
-        int i;
-        int dim;
-        int elements = Ntarget[0];
-        for (dim=1; dim<Dtarget; dim++)
-          elements *= Ntarget[dim];
-        for (i=0;i<elements;i++) 
-            target[i] = erfcx(x[i]);
-        """
-    x = np.asarray(x)
-    arg_names = ['target','x']
-    target = np.zeros_like(x)
-    weave.inline(code=code, arg_names=arg_names,**weave_kwargs)
-    return target
-
-def ln_diff_erf(x, y):
-    code = """
-        // Code for computing scaled complementary erf
-        int i;
-        int dim;
-        int elements = Ntarget[0];
-        for (dim=1; dim<Dtarget; dim++)
-          elements *= Ntarget[dim];
-        for (i=0;i<elements;i++) 
-          target[i] = ln_diff_erf(x[i], y[i]);
-        """
-    x = np.asarray(x)
-    y = np.asarray(y)
-    assert(x.shape==y.shape)
-    target = np.zeros_like(x)
-    arg_names = ['target','x', 'y']
-    weave.inline(code=code, arg_names=arg_names,**weave_kwargs)
-    return target
-
-def h(t, tprime, d_i, d_j, l):
-    code = """
-        // Code for computing the 1st order ODE h helper function.
-        int i;
-        int dim;
-        int elements = Ntarget[0];
-        for (dim=1; dim<Dtarget; dim++)
-          elements *= Ntarget[dim];
-        for (i=0;i<elements;i++) 
-          target[i] = h(t[i], tprime[i], d_i, d_j, l);
-        """
-    t = np.asarray(t)
-    tprime = np.asarray(tprime)
-    assert(tprime.shape==t.shape)
-    target = np.zeros_like(t)
-    arg_names = ['target','t', 'tprime', 'd_i', 'd_j', 'l']
-    weave.inline(code=code, arg_names=arg_names,**weave_kwargs)
-    return target
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@ -1,406 +0,0 @@
-# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-#
-#Parts of this file were influenced by the Matlab GPML framework written by
-#Carl Edward Rasmussen & Hannes Nickisch, however all bugs are our own.
-#
-#The GPML code is released under the FreeBSD License.
-#Copyright (c) 2005-2013 Carl Edward Rasmussen & Hannes Nickisch. All rights reserved.
-#
-#The code and associated documentation is available from
-#http://gaussianprocess.org/gpml/code.
-
-import numpy as np
-import scipy as sp
-from likelihood import likelihood
-from ..util.linalg import mdot, jitchol, pddet, dpotrs
-from functools import partial as partial_func
-import warnings
-
-class Laplace(likelihood):
-    """Laplace approximation to a posterior"""
-
-    def __init__(self, data, noise_model, extra_data=None):
-        """
-        Laplace Approximation
-
-        Find the moments \hat{f} and the hessian at this point
-        (using Newton-Raphson) of the unnormalised posterior
-
-        Compute the GP variables (i.e. generate some Y^{squiggle} and
-        z^{squiggle} which makes a gaussian the same as the laplace
-        approximation to the posterior, but normalised
-
-        Arguments
-        ---------
-
-        :param data: array of data the likelihood function is approximating
-        :type data: NxD
-        :param noise_model: likelihood function - subclass of noise_model
-        :type noise_model: noise_model
-        :param extra_data: additional data used by some likelihood functions,
-        """
-        self.data = data
-        self.noise_model = noise_model
-        self.extra_data = extra_data
-
-        #Inital values
-        self.N, self.D = self.data.shape
-        self.is_heteroscedastic = True
-        self.Nparams = 0
-        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
-
-        self.restart()
-        likelihood.__init__(self)
-
-    def restart(self):
-        """
-        Reset likelihood variables to their defaults
-        """
-        #Initial values for the GP variables
-        self.Y = np.zeros((self.N, 1))
-        self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:, None]
-        self.Z = 0
-        self.YYT = None
-
-        self.old_Ki_f = None
-        self.bad_fhat = False
-
-    def predictive_values(self,mu,var,full_cov,**noise_args):
-        if full_cov:
-            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model.predictive_values(mu,var,**noise_args)
-
-    def log_predictive_density(self, y_test, mu_star, var_star):
-        """
-        Calculation of the log predictive density
-
-        .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
-
-        :param y_test: test observations (y_{*})
-        :type y_test: (Nx1) array
-        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type mu_star: (Nx1) array
-        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type var_star: (Nx1) array
-        """
-        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
-
-    def _get_params(self):
-        return np.asarray(self.noise_model._get_params())
-
-    def _get_param_names(self):
-        return self.noise_model._get_param_names()
-
-    def _set_params(self, p):
-        return self.noise_model._set_params(p)
-
-    def _shared_gradients_components(self):
-        d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
-        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
-        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
-        return dL_dfhat, I_KW_i
-
-    def _Kgradients(self):
-        """
-        Gradients with respect to prior kernel parameters dL_dK to be chained
-        with dK_dthetaK to give dL_dthetaK
-        :returns: dL_dK matrix
-        :rtype: Matrix (1 x num_kernel_params)
-        """
-        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
-
-        #Explicit
-        #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
-        #expl_b = self.Wi_K_i
-        #expl = 0.5*expl_a - 0.5*expl_b
-        #dL_dthetaK_exp = dK_dthetaK(expl, X)
-
-        #Implicit
-        impl = mdot(dlp, dL_dfhat, I_KW_i)
-
-        #No longer required as we are computing these in the gp already
-        #otherwise we would take them away and add them back
-        #dL_dthetaK_imp = dK_dthetaK(impl, X)
-        #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
-        #dL_dK = expl + impl
-
-        #No need to compute explicit as we are computing dZ_dK to account
-        #for the difference between the K gradients of a normal GP,
-        #and the K gradients including the implicit part
-        dL_dK = impl
-        return dL_dK
-
-    def _gradients(self, partial):
-        """
-        Gradients with respect to likelihood parameters (dL_dthetaL)
-
-        :param partial: Not needed by this likelihood
-        :type partial: lambda function
-        :rtype: array of derivatives (1 x num_likelihood_params)
-        """
-        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
-
-        #len(dlik_dthetaL)
-        num_params = len(self._get_param_names())
-        # make space for one derivative for each likelihood parameter
-        dL_dthetaL = np.zeros(num_params)
-        for thetaL_i in range(num_params):
-            #Explicit
-            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[:, thetaL_i])
-                             #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
-                             + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[:, thetaL_i])
-                             )
-
-            #Implicit
-            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[:, thetaL_i])
-            dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
-            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
-
-        return dL_dthetaL
-
-    def _compute_GP_variables(self):
-        """
-        Generate data Y which would give the normal distribution identical
-        to the laplace approximation to the posterior, but normalised
-
-        GPy expects a likelihood to be gaussian, so need to caluclate
-        the data Y^{\tilde} that makes the posterior match that found
-        by a laplace approximation to a non-gaussian likelihood but with
-        a gaussian likelihood
-
-        Firstly,
-        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
-        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
-        we wish to find the hessian \Sigma^{\tilde}
-        that has the same curvature but using our new simulated data Y^{\tilde}
-        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
-        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
-        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
-
-        Secondly,
-        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
-        So we can suck up any differences between that and our log marginal likelihood approximation
-        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
-        which we want to optimize instead, by equating them and rearranging, the difference is added onto
-        the log p(y) that GPy optimizes by default
-
-        Thirdly,
-        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
-        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
-        gp.py code
-        """
-        Wi = 1.0/self.W
-        self.Sigma_tilde = np.diagflat(Wi)
-
-        Y_tilde = Wi*self.Ki_f + self.f_hat
-
-        self.Wi_K_i = self.W12BiW12
-        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-
-        Z_tilde = (+ lik
-                   - 0.5*self.ln_B_det
-                   + 0.5*ln_det_Wi_K
-                   - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_K_i_y
-                   + self.NORMAL_CONST
-                  )
-
-        #Convert to float as its (1, 1) and Z must be a scalar
-        self.Z = np.float64(Z_tilde)
-        self.Y = Y_tilde
-        self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
-
-        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
-        self.dZ_dK = self._Kgradients()
-        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
-
-    def fit_full(self, K):
-        """
-        The laplace approximation algorithm, find K and expand hessian
-        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
-
-        :param K: Prior covariance matrix evaluated at locations X
-        :type K: NxN matrix
-        """
-        self.K = K.copy()
-
-        #Find mode
-        self.f_hat = self.rasm_mode(self.K)
-
-        #Compute hessian and other variables at mode
-        self._compute_likelihood_variables()
-
-        #Compute fake variables replicating laplace approximation to posterior
-        self._compute_GP_variables()
-
-    def _compute_likelihood_variables(self):
-        """
-        Compute the variables required to compute gaussian Y variables
-        """
-        #At this point get the hessian matrix (or vector as W is diagonal)
-        self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
-
-        if not self.noise_model.log_concave:
-            i = self.W < 1e-6
-            if np.any(i):
-                warnings.warn('truncating non log-concave likelihood curvature')
-                # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                self.W[i] = 1e-6
-
-        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
-
-        self.Ki_f = self.Ki_f
-        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
-        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
-
-    def _compute_B_statistics(self, K, W, a):
-        """
-        Rasmussen suggests the use of a numerically stable positive definite matrix B
-        Which has a positive diagonal element and can be easyily inverted
-
-        :param K: Prior Covariance matrix evaluated at locations X
-        :type K: NxN matrix
-        :param W: Negative hessian at a point (diagonal matrix)
-        :type W: Vector of diagonal values of hessian (1xN)
-        :param a: Matrix to calculate W12BiW12a
-        :type a: Matrix NxN
-        :returns: (W12BiW12a, ln_B_det)
-        """
-        if not self.noise_model.log_concave:
-            #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
-            W[W < 1e-10] = 1e-10  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                                  # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
-                                  # To cause the posterior to become less certain than the prior and likelihood,
-                                  # This is a property only held by non-log-concave likelihoods
-
-
-        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
-        W_12 = np.sqrt(W)
-        B = np.eye(self.N) + W_12*K*W_12.T
-        L = jitchol(B)
-
-        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
-        ln_B_det = 2*np.sum(np.log(np.diag(L)))
-        return W12BiW12a, ln_B_det
-
-    def rasm_mode(self, K, MAX_ITER=40):
-        """
-        Rasmussen's numerically stable mode finding
-        For nomenclature see Rasmussen & Williams 2006
-        Influenced by GPML (BSD) code, all errors are our own
-
-        :param K: Covariance matrix evaluated at locations X
-        :type K: NxD matrix
-        :param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
-        :type MAX_ITER: scalar
-        :returns: f_hat, mode on which to make laplace approxmiation
-        :rtype: NxD matrix
-        """
-        #old_Ki_f = np.zeros((self.N, 1))
-
-        #Start f's at zero originally of if we have gone off track, try restarting
-        if self.old_Ki_f is None or self.bad_fhat:
-            old_Ki_f = np.random.rand(self.N, 1)/50.0
-            #old_Ki_f = self.Y
-            f = np.dot(K, old_Ki_f)
-        else:
-            #Start at the old best point
-            old_Ki_f = self.old_Ki_f.copy()
-            f = self.f_hat.copy()
-
-        new_obj = -np.inf
-        old_obj = np.inf
-
-        def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
-
-        difference = np.inf
-        epsilon = 1e-7
-        #step_size = 1
-        #rs = 0
-        i = 0
-
-        while difference > epsilon and i < MAX_ITER:
-            W = -self.noise_model.d2logpdf_df2(f, self.data, extra_data=self.extra_data)
-
-            W_f = W*f
-            grad = self.noise_model.dlogpdf_df(f, self.data, extra_data=self.extra_data)
-
-            b = W_f + grad
-            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
-
-            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
-            full_step_Ki_f = b - W12BiW12Kb
-            dKi_f = full_step_Ki_f - old_Ki_f
-
-            f_old = f.copy()
-            def inner_obj(step_size, old_Ki_f, dKi_f, K):
-                Ki_f = old_Ki_f + step_size*dKi_f
-                f = np.dot(K, Ki_f)
-                # This is nasty, need to set something within an optimization though
-                self.tmp_Ki_f = Ki_f.copy()
-                self.tmp_f = f.copy()
-                return -obj(Ki_f, f)
-
-            i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
-            #Find the stepsize that minimizes the objective function using a brent line search
-            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
-            #steps than get this exact then make a step, if B was bigger it might be the other way around though
-            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
-            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
-            f = self.tmp_f.copy()
-            Ki_f = self.tmp_Ki_f.copy()
-
-            #Optimize without linesearch
-            #f_old = f.copy()
-            #update_passed = False
-            #while not update_passed:
-                #Ki_f = old_Ki_f + step_size*dKi_f
-                #f = np.dot(K, Ki_f)
-
-                #old_obj = new_obj
-                #new_obj = obj(Ki_f, f)
-                #difference = new_obj - old_obj
-                ##print "difference: ",difference
-                #if difference < 0:
-                    ##print "Objective function rose", np.float(difference)
-                    ##If the objective function isn't rising, restart optimization
-                    #step_size *= 0.8
-                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                    ##objective function isn't increasing, try reducing step size
-                    #f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
-                    #old_obj = new_obj
-                    #rs += 1
-                #else:
-                    #update_passed = True
-
-            #old_Ki_f = self.Ki_f.copy()
-
-            #difference = abs(new_obj - old_obj)
-            #old_obj = new_obj.copy()
-            difference = np.abs(np.sum(f - f_old)) + np.abs(np.sum(Ki_f - old_Ki_f))
-            #difference = np.abs(np.sum(Ki_f - old_Ki_f))/np.float(self.N)
-            old_Ki_f = Ki_f.copy()
-            i += 1
-
-        self.old_Ki_f = old_Ki_f.copy()
-
-        #Warn of bad fits
-        if difference > epsilon:
-            self.bad_fhat = True
-            warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
-        elif self.bad_fhat:
-            self.bad_fhat = False
-            warnings.warn("f_hat now perfect again")
-
-        self.Ki_f = Ki_f
-        return f
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@ -1,222 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats,special
-import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import gp_transformations
-from noise_distributions import NoiseDistribution
-
-class Bernoulli(NoiseDistribution):
-    """
-    Bernoulli likelihood
-
-    .. math::
-        p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
-
-    .. Note::
-        Y is expected to take values in {-1,1}
-        Probit likelihood usually used
-    """
-    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Bernoulli, self).__init__(gp_link,analytical_mean,analytical_variance)
-        if isinstance(gp_link , (gp_transformations.Heaviside, gp_transformations.Probit)):
-            self.log_concave = True
-
-    def _preprocess_values(self,Y):
-        """
-        Check if the values of the observations correspond to the values
-        assumed by the likelihood function.
-
-        ..Note:: Binary classification algorithm works better with classes {-1,1}
-        """
-        Y_prep = Y.copy()
-        Y1 = Y[Y.flatten()==1].size
-        Y2 = Y[Y.flatten()==0].size
-        assert Y1 + Y2 == Y.size, 'Bernoulli likelihood is meant to be used only with outputs in {0,1}.'
-        Y_prep[Y.flatten() == 0] = -1
-        return Y_prep
-
-    def _moments_match_analytical(self,data_i,tau_i,v_i):
-        """
-        Moments match of the marginal approximation in EP algorithm
-
-        :param i: number of observation (int)
-        :param tau_i: precision of the cavity distribution (float)
-        :param v_i: mean/variance of the cavity distribution (float)
-        """
-        if data_i == 1:
-            sign = 1.
-        elif data_i == 0:
-            sign = -1
-        else:
-            raise ValueError("bad value for Bernouilli observation (0,1)")
-        if isinstance(self.gp_link,gp_transformations.Probit):
-            z = sign*v_i/np.sqrt(tau_i**2 + tau_i)
-            Z_hat = std_norm_cdf(z)
-            phi = std_norm_pdf(z)
-            mu_hat = v_i/tau_i + sign*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
-            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
-
-        elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            a = sign*v_i/np.sqrt(tau_i)
-            Z_hat = std_norm_cdf(a)
-            N = std_norm_pdf(a)
-            mu_hat = v_i/tau_i + sign*N/Z_hat/np.sqrt(tau_i)
-            sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
-            if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
-                stop
-        else:
-            raise ValueError("Exact moment matching not available for link {}".format(self.gp_link.gp_transformations.__name__))
-
-        return Z_hat, mu_hat, sigma2_hat
-
-    def _predictive_mean_analytical(self,mu,variance):
-
-        if isinstance(self.gp_link,gp_transformations.Probit):
-            return stats.norm.cdf(mu/np.sqrt(1+variance))
-
-        elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            return stats.norm.cdf(mu/np.sqrt(variance))
-
-        else:
-            raise NotImplementedError
-
-    def _predictive_variance_analytical(self,mu,variance, pred_mean):
-
-        if isinstance(self.gp_link,gp_transformations.Heaviside):
-            return 0.
-        else:
-            raise NotImplementedError
-
-    def pdf_link(self, link_f, y, extra_data=None):
-        """
-        Likelihood function given link(f)
-
-        .. math::
-            p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in bernoulli
-        :returns: likelihood evaluated for this point
-        :rtype: float
-
-        .. Note:
-            Each y_i must be in {0,1}
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        objective = (link_f**y) * ((1.-link_f)**(1.-y))
-        return np.exp(np.sum(np.log(objective)))
-
-    def logpdf_link(self, link_f, y, extra_data=None):
-        """
-        Log Likelihood function given link(f)
-
-        .. math::
-            \\ln p(y_{i}|\\lambda(f_{i})) = y_{i}\\log\\lambda(f_{i}) + (1-y_{i})\\log (1-f_{i})
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in bernoulli
-        :returns: log likelihood evaluated at points link(f)
-        :rtype: float
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        #objective = y*np.log(link_f) + (1.-y)*np.log(link_f)
-        objective = np.where(y==1, np.log(link_f), np.log(1-link_f))
-        return np.sum(objective)
-
-    def dlogpdf_dlink(self, link_f, y, extra_data=None):
-        """
-        Gradient of the pdf at y, given link(f) w.r.t link(f)
-
-        .. math::
-            \\frac{d\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - \\frac{(1 - y_{i})}{(1 - \\lambda(f_{i}))}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in bernoulli
-        :returns: gradient of log likelihood evaluated at points link(f)
-        :rtype: Nx1 array
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        grad = (y/link_f) - (1.-y)/(1-link_f)
-        return grad
-
-    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
-        """
-        Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j
-        i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
-
-
-        .. math::
-            \\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(1-y_{i})}{(1-\\lambda(f))^{2}}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in bernoulli
-        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
-        :rtype: Nx1 array
-
-        .. Note::
-            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        d2logpdf_dlink2 = -y/(link_f**2) - (1-y)/((1-link_f)**2)
-        return d2logpdf_dlink2
-
-    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
-        """
-        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
-
-        .. math::
-            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f)^{3}} - \\frac{2(1-y_{i}}{(1-\\lambda(f))^{3}}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data not used in bernoulli
-        :returns: third derivative of log likelihood evaluated at points link(f)
-        :rtype: Nx1 array
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        d3logpdf_dlink3 = 2*(y/(link_f**3) - (1-y)/((1-link_f)**3))
-        return d3logpdf_dlink3
-
-    def _mean(self,gp):
-        """
-        Mass (or density) function
-        """
-        return self.gp_link.transf(gp)
-
-    def _variance(self,gp):
-        """
-        Mass (or density) function
-        """
-        p = self.gp_link.transf(gp)
-        return p*(1.-p)
-
-    def samples(self, gp):
-        """
-        Returns a set of samples of observations based on a given value of the latent variable.
-
-        :param gp: latent variable
-        """
-        orig_shape = gp.shape
-        gp = gp.flatten()
-        ns = np.ones_like(gp, dtype=int)
-        Ysim = np.random.binomial(ns, self.gp_link.transf(gp))
-        return Ysim.reshape(orig_shape)
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@ -1,277 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-from scipy import stats, special
-import scipy as sp
-import gp_transformations
-from noise_distributions import NoiseDistribution
-from scipy import stats, integrate
-from scipy.special import gammaln, gamma
-
-class StudentT(NoiseDistribution):
-    """
-    Student T likelihood
-
-    For nomanclature see Bayesian Data Analysis 2003 p576
-
-    .. math::
-        p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
-
-    """
-    def __init__(self,gp_link=None,analytical_mean=True,analytical_variance=True, deg_free=5, sigma2=2):
-        self.v = deg_free
-        self.sigma2 = sigma2
-
-        self._set_params(np.asarray(sigma2))
-        super(StudentT, self).__init__(gp_link,analytical_mean,analytical_variance)
-        self.log_concave = False
-
-    def _get_params(self):
-        return np.asarray(self.sigma2)
-
-    def _get_param_names(self):
-        return ["t_noise_std2"]
-
-    def _set_params(self, x):
-        self.sigma2 = float(x)
-
-    @property
-    def variance(self, extra_data=None):
-        return (self.v / float(self.v - 2)) * self.sigma2
-
-    def pdf_link(self, link_f, y, extra_data=None):
-        """
-        Likelihood function given link(f)
-
-        .. math::
-            p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution
-        :returns: likelihood evaluated for this point
-        :rtype: float
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        e = y - link_f
-        #Careful gamma(big_number) is infinity!
-        objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
-                     / (np.sqrt(self.v * np.pi * self.sigma2)))
-                     * ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
-                    )
-        return np.prod(objective)
-
-    def logpdf_link(self, link_f, y, extra_data=None):
-        """
-        Log Likelihood Function given link(f)
-
-        .. math::
-            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)
-
-        :param link_f: latent variables (link(f))
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution
-        :returns: likelihood evaluated for this point
-        :rtype: float
-
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        e = y - link_f
-        objective = (+ gammaln((self.v + 1) * 0.5)
-                     - gammaln(self.v * 0.5)
-                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
-                     - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
-                    )
-        return np.sum(objective)
-
-    def dlogpdf_dlink(self, link_f, y, extra_data=None):
-        """
-        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
-
-        .. math::
-            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v}
-
-        :param link_f: latent variables (f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution
-        :returns: gradient of likelihood evaluated at points
-        :rtype: Nx1 array
-
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        e = y - link_f
-        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
-        return grad
-
-    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
-        """
-        Hessian at y, given link(f), w.r.t link(f)
-        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
-        The hessian will be 0 unless i == j
-
-        .. math::
-            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution
-        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
-        :rtype: Nx1 array
-
-        .. Note::
-            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        e = y - link_f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
-        return hess
-
-    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
-        """
-        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
-
-        .. math::
-            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \lambda(f_{i}))^3 - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \lambda(f_{i})) + \\sigma^{2} v)^3}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution
-        :returns: third derivative of likelihood evaluated at points f
-        :rtype: Nx1 array
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        e = y - link_f
-        d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       ((e**2 + self.sigma2*self.v)**3)
-                    )
-        return d3lik_dlink3
-
-    def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
-        """
-        Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
-
-        .. math::
-            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution
-        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
-        :rtype: float
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        e = y - link_f
-        dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
-        return np.sum(dlogpdf_dvar)
-
-    def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
-        """
-        Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise)
-
-        .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2}
-
-        :param link_f: latent variables link_f
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution
-        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
-        :rtype: Nx1 array
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        e = y - link_f
-        dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
-        return dlogpdf_dlink_dvar
-
-    def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
-        """
-        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_noise)
-
-        .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}}
-
-        :param link_f: latent variables link(f)
-        :type link_f: Nx1 array
-        :param y: data
-        :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution
-        :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
-        :rtype: Nx1 array
-        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        e = y - link_f
-        d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
-                              / ((self.sigma2*self.v + (e**2))**3)
-                           )
-        return d2logpdf_dlink2_dvar
-
-    def dlogpdf_link_dtheta(self, f, y, extra_data=None):
-        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
-        return np.asarray([[dlogpdf_dvar]])
-
-    def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
-        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
-        return dlogpdf_dlink_dvar
-
-    def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
-        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
-        return d2logpdf_dlink2_dvar
-
-    def _predictive_variance_analytical(self, mu, sigma, predictive_mean=None):
-        """
-        Compute predictive variance of student_t*normal p(y*|f*)p(f*)
-
-        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
-        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
-        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
-        """
-
-        #FIXME: Not correct
-        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
-        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
-        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
-        #Which was also given to us as (var)
-        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
-        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
-        true_var = 1/(1/sigma**2 + 1/self.variance)
-
-        return true_var
-
-    def _predictive_mean_analytical(self, mu, sigma):
-        """
-        Compute mean of the prediction
-        """
-        #FIXME: Not correct
-        return mu
-
-    def samples(self, gp):
-        """
-        Returns a set of samples of observations based on a given value of the latent variable.
-
-        :param gp: latent variable
-        """
-        orig_shape = gp.shape
-        gp = gp.flatten()
-        #FIXME: Very slow as we are computing a new random variable per input!
-        #Can't get it to sample all at the same time
-        #student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
-        dfs = np.ones_like(gp)*self.v
-        scales = np.ones_like(gp)*np.sqrt(self.sigma2)
-        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
-                                        scale=scales)
-        return student_t_samples.reshape(orig_shape)
--- a/GPy/models.py
+++ b/GPy/models.py
@ -1,33 +0,0 @@
-'''
-.. module:: GPy.models
-
-Implementations for common models used in GP regression and classification.
-The different models can be viewed in :mod:`GPy.models_modules`, which holds
-detailed explanations for the different models.
-
-.. note::
-    This module is a convienince module for endusers to use. For developers 
-    see :mod:`GPy.models_modules`, which holds the implementions for each model.: 
-
-.. moduleauthor:: Max Zwiessele <ibinbei@gmail.com>
-'''
-
-__updated__ = '2013-11-28'
-
-from models_modules.bayesian_gplvm import BayesianGPLVM, BayesianGPLVMWithMissingData
-from models_modules.gp_regression import GPRegression
-from models_modules.gp_classification import GPClassification#; _gp_classification = gp_classification ; del gp_classification 
-from models_modules.sparse_gp_regression import SparseGPRegression#; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
-from models_modules.svigp_regression import SVIGPRegression#; _svigp_regression = svigp_regression ; del svigp_regression 
-from models_modules.sparse_gp_classification import SparseGPClassification#; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
-from models_modules.fitc_classification import FITCClassification#; _fitc_classification = fitc_classification ; del fitc_classification 
-from models_modules.gplvm import GPLVM#; _gplvm = gplvm ; del gplvm 
-from models_modules.bcgplvm import BCGPLVM#; _bcgplvm = bcgplvm; del bcgplvm
-from models_modules.sparse_gplvm import SparseGPLVM#; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
-from models_modules.warped_gp import WarpedGP#; _warped_gp = warped_gp ; del warped_gp 
-from models_modules.bayesian_gplvm import BayesianGPLVM#; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
-from models_modules.mrd import MRD#; _mrd = mrd; del mrd 
-from models_modules.gradient_checker import GradientChecker#; _gradient_checker = gradient_checker ; del gradient_checker 
-from models_modules.gp_multioutput_regression import GPMultioutputRegression#; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
-from models_modules.sparse_gp_multioutput_regression import SparseGPMultioutputRegression#; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
-from models_modules.gradient_checker import GradientChecker
--- a/GPy/models_modules/init.py
+++ b/GPy/models_modules/init.py
@ -1,19 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-# from gp_regression import GPRegression; _gp_regression = gp_regression ; del gp_regression 
-# from gp_classification import GPClassification; _gp_classification = gp_classification ; del gp_classification 
-# from sparse_gp_regression import SparseGPRegression; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
-# from svigp_regression import SVIGPRegression; _svigp_regression = svigp_regression ; del svigp_regression 
-# from sparse_gp_classification import SparseGPClassification; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
-# from fitc_classification import FITCClassification; _fitc_classification = fitc_classification ; del fitc_classification 
-# from gplvm import GPLVM; _gplvm = gplvm ; del gplvm 
-# from bcgplvm import BCGPLVM; _bcgplvm = bcgplvm; del bcgplvm
-# from sparse_gplvm import SparseGPLVM; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
-# from warped_gp import WarpedGP; _warped_gp = warped_gp ; del warped_gp 
-# from bayesian_gplvm import BayesianGPLVM; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
-# from mrd import MRD; _mrd = mrd ; del mrd 
-# from gradient_checker import GradientChecker; _gradient_checker = gradient_checker ; del gradient_checker 
-# from gp_multioutput_regression import GPMultioutputRegression; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
-# from sparse_gp_multioutput_regression import SparseGPMultioutputRegression; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
-
--- a/GPy/models_modules/bayesian_gplvm.py
+++ b/GPy/models_modules/bayesian_gplvm.py
@ -1,234 +0,0 @@
-# Copyright (c) 2012 - 2014 the GPy Austhors (see AUTHORS.txt)
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-from .. import kern
-from ..core.sparse_gp_mpi import SparseGP_MPI
-from ..likelihoods import Gaussian
-from ..core.parameterization.variational import NormalPosterior, NormalPrior
-from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
-import logging
-
-class BayesianGPLVM(SparseGP_MPI):
-    """
-    Bayesian Gaussian Process Latent Variable Model
-
-    :param Y: observed data (np.ndarray) or GPy.likelihood
-    :type Y: np.ndarray| GPy.likelihood instance
-    :param input_dim: latent dimensionality
-    :type input_dim: int
-    :param init: initialisation method for the latent space
-    :type init: 'PCA'|'random'
-
-    """
-    def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None,
-                 name='bayesian gplvm', mpi_comm=None, normalizer=None,
-                 missing_data=False, stochastic=False, batchsize=1):
-
-        self.logger = logging.getLogger(self.__class__.__name__)
-        if X is None:
-            from ..util.initialization import initialize_latent
-            self.logger.info("initializing latent space X with method {}".format(init))
-            X, fracs = initialize_latent(init, input_dim, Y)
-        else:
-            fracs = np.ones(input_dim)
-
-        self.init = init
-
-        if X_variance is None:
-            self.logger.info("initializing latent space variance ~ uniform(0,.1)")
-            X_variance = np.random.uniform(0,.1,X.shape)
-
-        if Z is None:
-            self.logger.info("initializing inducing inputs")
-            Z = np.random.permutation(X.copy())[:num_inducing]
-        assert Z.shape[1] == X.shape[1]
-
-        if kernel is None:
-            self.logger.info("initializing kernel RBF")
-            kernel = kern.RBF(input_dim, lengthscale=1./fracs, ARD=True) #+ kern.Bias(input_dim) + kern.White(input_dim)
-
-        if likelihood is None:
-            likelihood = Gaussian()
-
-        self.variational_prior = NormalPrior()
-        X = NormalPosterior(X, X_variance)
-
-        if inference_method is None:
-            if mpi_comm is not None:
-                inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
-            else:
-                from ..inference.latent_function_inference.var_dtc import VarDTC
-                self.logger.debug("creating inference_method var_dtc")
-                inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
-        if isinstance(inference_method,VarDTC_minibatch):
-            inference_method.mpi_comm = mpi_comm
-
-        super(BayesianGPLVM,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
-                                           name=name, inference_method=inference_method,
-                                           normalizer=normalizer, mpi_comm=mpi_comm,
-                                           variational_prior=self.variational_prior,
-                                           )
-        self.link_parameter(self.X, index=0)
-
-    def set_X_gradients(self, X, X_grad):
-        """Set the gradients of the posterior distribution of X in its specific form."""
-        X.mean.gradient, X.variance.gradient = X_grad
-
-    def get_X_gradients(self, X):
-        """Get the gradients of the posterior distribution of X in its specific form."""
-        return X.mean.gradient, X.variance.gradient
-
-    def parameters_changed(self):
-        super(BayesianGPLVM,self).parameters_changed()
-        if isinstance(self.inference_method, VarDTC_minibatch):
-            return        
-
-        kl_fctr = 1.
-        self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
-
-        self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(
-                                            variational_posterior=self.X,
-                                            Z=self.Z,
-                                            dL_dpsi0=self.grad_dict['dL_dpsi0'],
-                                            dL_dpsi1=self.grad_dict['dL_dpsi1'],
-                                            dL_dpsi2=self.grad_dict['dL_dpsi2'])
-
-        self.variational_prior.update_gradients_KL(self.X)
-
-
-        #super(BayesianGPLVM, self).parameters_changed()
-        #self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
-
-        #self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2'])
-
-        # This is testing code -------------------------
-#         i = np.random.randint(self.X.shape[0])
-#         X_ = self.X.mean
-#         which = np.sqrt(((X_ - X_[i:i+1])**2).sum(1)).argsort()>(max(0, self.X.shape[0]-51))
-#         _, _, grad_dict = self.inference_method.inference(self.kern, self.X[which], self.Z, self.likelihood, self.Y[which], self.Y_metadata)
-#         grad = self.kern.gradients_qX_expectations(variational_posterior=self.X[which], Z=self.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
-#
-#         self.X.mean.gradient[:] = 0
-#         self.X.variance.gradient[:] = 0
-#         self.X.mean.gradient[which] = grad[0]
-#         self.X.variance.gradient[which] = grad[1]
-
-        # update for the KL divergence
-#         self.variational_prior.update_gradients_KL(self.X, which)
-        # -----------------------------------------------
-
-        # update for the KL divergence
-        #self.variational_prior.update_gradients_KL(self.X)
-
-    def plot_latent(self, labels=None, which_indices=None,
-                resolution=50, ax=None, marker='o', s=40,
-                fignum=None, plot_inducing=True, legend=True,
-                plot_limits=None,
-                aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
-        import sys
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import dim_reduction_plots
-
-        return dim_reduction_plots.plot_latent(self, labels, which_indices,
-                resolution, ax, marker, s,
-                fignum, plot_inducing, legend,
-                plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
-
-    def do_test_latents(self, Y):
-        """
-        Compute the latent representation for a set of new points Y
-
-        Notes:
-        This will only work with a univariate Gaussian likelihood (for now)
-        """
-        N_test = Y.shape[0]
-        input_dim = self.Z.shape[1]
-
-        means = np.zeros((N_test, input_dim))
-        covars = np.zeros((N_test, input_dim))
-
-        dpsi0 = -0.5 * self.input_dim / self.likelihood.variance
-        dpsi2 = self.grad_dict['dL_dpsi2'][0][None, :, :] # TODO: this may change if we ignore het. likelihoods
-        V = Y/self.likelihood.variance
-
-        #compute CPsi1V
-        #if self.Cpsi1V is None:
-        #    psi1V = np.dot(self.psi1.T, self.likelihood.V)
-        #    tmp, _ = linalg.dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
-        #    tmp, _ = linalg.dpotrs(self.LB, tmp, lower=1)
-        #    self.Cpsi1V, _ = linalg.dtrtrs(self._Lm, tmp, lower=1, trans=1)
-
-        dpsi1 = np.dot(self.posterior.woodbury_vector, V.T)
-
-        #start = np.zeros(self.input_dim * 2)
-
-
-        from scipy.optimize import minimize
-
-        for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
-            args = (input_dim, self.kern.copy(), self.Z, dpsi0, dpsi1_n.T, dpsi2)
-            res = minimize(latent_cost_and_grad, jac=True, x0=np.hstack((means[n], covars[n])), args=args, method='BFGS')
-            xopt = res.x
-            mu, log_S = xopt.reshape(2, 1, -1)
-            means[n] = mu[0].copy()
-            covars[n] = np.exp(log_S[0]).copy()
-
-        X = NormalPosterior(means, covars)
-
-        return X
-
-    def dmu_dX(self, Xnew):
-        """
-        Calculate the gradient of the prediction at Xnew w.r.t Xnew.
-        """
-        dmu_dX = np.zeros_like(Xnew)
-        for i in range(self.Z.shape[0]):
-            dmu_dX += self.kern.gradients_X(self.grad_dict['dL_dpsi1'][i:i + 1, :], Xnew, self.Z[i:i + 1, :])
-        return dmu_dX
-
-    def dmu_dXnew(self, Xnew):
-        """
-        Individual gradient of prediction at Xnew w.r.t. each sample in Xnew
-        """
-        gradients_X = np.zeros((Xnew.shape[0], self.num_inducing))
-        ones = np.ones((1, 1))
-        for i in range(self.Z.shape[0]):
-            gradients_X[:, i] = self.kern.gradients_X(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
-        return np.dot(gradients_X, self.grad_dict['dL_dpsi1'])
-
-    def plot_steepest_gradient_map(self, *args, ** kwargs):
-        """
-        See GPy.plotting.matplot_dep.dim_reduction_plots.plot_steepest_gradient_map
-        """
-        import sys
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import dim_reduction_plots
-
-        return dim_reduction_plots.plot_steepest_gradient_map(self,*args,**kwargs)
-
-
-def latent_cost_and_grad(mu_S, input_dim, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
-    """
-    objective function for fitting the latent variables for test points
-    (negative log-likelihood: should be minimised!)
-    """
-    mu = mu_S[:input_dim][None]
-    log_S = mu_S[input_dim:][None]
-    S = np.exp(log_S)
-
-    X = NormalPosterior(mu, S)
-
-    psi0 = kern.psi0(Z, X)
-    psi1 = kern.psi1(Z, X)
-    psi2 = kern.psi2(Z, X)
-
-    lik = dL_dpsi0 * psi0.sum() + np.einsum('ij,kj->...', dL_dpsi1, psi1) + np.einsum('ijk,lkj->...', dL_dpsi2, psi2) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
-
-    dLdmu, dLdS = kern.gradients_qX_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, X)
-    dmu = dLdmu - mu
-    # dS = S0 + S1 + S2 -0.5 + .5/S
-    dlnS = S * (dLdS - 0.5) + .5
-
-    return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))
--- a/GPy/models_modules/bcgplvm.py
+++ b/GPy/models_modules/bcgplvm.py
@ -1,48 +0,0 @@
-# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from ..core import GP
-from ..models import GPLVM
-from ..mappings import Kernel
-
-
-class BCGPLVM(GPLVM):
-    """
-    Back constrained Gaussian Process Latent Variable Model
-
-    :param Y: observed data
-    :type Y: np.ndarray
-    :param input_dim: latent dimensionality
-    :type input_dim: int
-    :param init: initialisation method for the latent space
-    :type init: 'PCA'|'random'
-    :param mapping: mapping for back constraint
-    :type mapping: GPy.core.Mapping object
-
-    """
-    def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False, mapping=None):
-        
-        if mapping is None:
-            mapping = Kernel(X=Y, output_dim=input_dim)
-        self.mapping = mapping
-        GPLVM.__init__(self, Y, input_dim, init, X, kernel, normalize_Y)
-        self.X = self.mapping.f(self.likelihood.Y)
-
-    def _get_param_names(self):
-        return self.mapping._get_param_names() + GP._get_param_names(self)
-
-    def _get_params(self):
-        return np.hstack((self.mapping._get_params(), GP._get_params(self)))
-
-    def _set_params(self, x):
-        self.mapping._set_params(x[:self.mapping.num_params])
-        self.X = self.mapping.f(self.likelihood.Y)
-        GP._set_params(self, x[self.mapping.num_params:])
-
-    def _log_likelihood_gradients(self):
-        dL_df = self.kern.gradients_X(self.dL_dK, self.X)
-        dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y)
-        return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self)))
-
--- a/GPy/models_modules/gp_classification.py
+++ b/GPy/models_modules/gp_classification.py
@ -1,29 +0,0 @@
-# Copyright (c) 2013, the GPy Authors (see AUTHORS.txt)
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-from ..core import GP
-from .. import likelihoods
-from .. import kern
-from ..inference.latent_function_inference.expectation_propagation import EP
-
-class GPClassification(GP):
-    """
-    Gaussian Process classification
-
-    This is a thin wrapper around the models.GP class, with a set of sensible defaults
-
-    :param X: input observations
-    :param Y: observed values, can be None if likelihood is not None
-    :param kernel: a GPy kernel, defaults to rbf
-
-    .. Note:: Multiple independent outputs are allowed using columns of Y
-
-    """
-
-    def __init__(self, X, Y, kernel=None,Y_metadata=None):
-        if kernel is None:
-            kernel = kern.RBF(X.shape[1])
-
-        likelihood = likelihoods.Bernoulli()
-
-        GP.__init__(self, X=X, Y=Y,  kernel=kernel, likelihood=likelihood, inference_method=EP(), name='gp_classification')
--- a/GPy/models_modules/gp_regression.py
+++ b/GPy/models_modules/gp_regression.py
@ -1,36 +0,0 @@
-# Copyright (c) 2012 - 2014 the GPy Austhors (see AUTHORS.txt)
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-from ..core import GP
-from .. import likelihoods
-from .. import kern
-
-class GPRegression(GP):
-    """
-    Gaussian Process model for regression
-
-    This is a thin wrapper around the models.GP class, with a set of sensible defaults
-
-    :param X: input observations
-    :param Y: observed values
-    :param kernel: a GPy kernel, defaults to rbf
-    :param Norm normalizer: [False]
-
-        Normalize Y with the norm given.
-        If normalizer is False, no normalization will be done
-        If it is None, we use GaussianNorm(alization)
-
-    .. Note:: Multiple independent outputs are allowed using columns of Y
-
-    """
-
-    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None):
-
-        if kernel is None:
-            kernel = kern.RBF(X.shape[1])
-
-        likelihood = likelihoods.Gaussian()
-
-        super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer)
-
--- a/GPy/models_modules/gplvm.py
+++ b/GPy/models_modules/gplvm.py
@ -1,83 +0,0 @@
-# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from .. import kern
-from ..core import GP, Param
-from ..likelihoods import Gaussian
-from .. import util
-
-
-class GPLVM(GP):
-    """
-    Gaussian Process Latent Variable Model
-
-
-    """
-    def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, name="gplvm"):
-
-        """
-        :param Y: observed data
-        :type Y: np.ndarray
-        :param input_dim: latent dimensionality
-        :type input_dim: int
-        :param init: initialisation method for the latent space
-        :type init: 'PCA'|'random'
-        """
-        if X is None:
-            from ..util.initialization import initialize_latent
-            X, fracs = initialize_latent(init, input_dim, Y)
-        else:
-            fracs = np.ones(input_dim)
-        if kernel is None:
-            kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=input_dim > 1) + kern.Bias(input_dim, np.exp(-2))
-
-        likelihood = Gaussian()
-
-        super(GPLVM, self).__init__(X, Y, kernel, likelihood, name='GPLVM')
-        self.X = Param('latent_mean', X)
-        self.link_parameter(self.X, index=0)
-
-    def parameters_changed(self):
-        super(GPLVM, self).parameters_changed()
-        self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
-
-    def jacobian(self,X):
-        J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
-        for i in range(self.output_dim):
-            J[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1], X, self.X)
-        return J
-
-    def magnification(self,X):
-        target=np.zeros(X.shape[0])
-        #J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
-        J = self.jacobian(X)
-        for i in range(X.shape[0]):
-            target[i]=np.sqrt(np.linalg.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
-        return target
-
-    def plot(self):
-        assert self.likelihood.Y.shape[1] == 2
-        pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet)  # @UndefinedVariable
-        Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
-        mu, _ = self.predict(Xnew)
-        import pylab as pb
-        pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
-
-    def plot_latent(self, labels=None, which_indices=None,
-                resolution=50, ax=None, marker='o', s=40,
-                fignum=None, legend=True,
-                plot_limits=None,
-                aspect='auto', updates=False, **kwargs):
-        import sys
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import dim_reduction_plots
-
-        return dim_reduction_plots.plot_latent(self, labels, which_indices,
-                resolution, ax, marker, s,
-                fignum, False, legend,
-                plot_limits, aspect, updates, **kwargs)
-
-    def plot_magnification(self, *args, **kwargs):
-        return util.plot_latent.plot_magnification(self, *args, **kwargs)
--- a/GPy/models_modules/gradient_checker.py
+++ b/GPy/models_modules/gradient_checker.py
@ -1,113 +0,0 @@
-# ## Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-from ..core.model import Model
-import itertools
-import numpy
-from ..core.parameterization import Param
-
-def get_shape(x):
-    if isinstance(x, numpy.ndarray):
-        return x.shape
-    return ()
-
-def at_least_one_element(x):
-    if isinstance(x, (list, tuple)):
-        return x
-    return [x]
-
-def flatten_if_needed(x):
-    return numpy.atleast_1d(x).flatten()
-
-class GradientChecker(Model):
-
-    def __init__(self, f, df, x0, names=None, *args, **kwargs):
-        """
-        :param f: Function to check gradient for
-        :param df: Gradient of function to check
-        :param x0:
-            Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
-            Can be a list of arrays, if takes a list of arrays. This list will be passed
-            to f and df in the same order as given here.
-            If only one argument, make sure not to pass a list!!!
-
-        :type x0: [array-like] | array-like | float | int
-        :param names:
-            Names to print, when performing gradcheck. If a list was passed to x0
-            a list of names with the same length is expected.
-        :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
-
-        Examples:
-        ---------
-            from GPy.models import GradientChecker
-            N, M, Q = 10, 5, 3
-
-            Sinusoid:
-
-                X = numpy.random.rand(N, Q)
-                grad = GradientChecker(numpy.sin,numpy.cos,X,'x')
-                grad.checkgrad(verbose=1)
-
-            Using GPy:
-
-                X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
-                kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
-                grad = GradientChecker(kern.K,
-                                       lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x),
-                                       x0 = X.copy(),
-                                       names='X')
-                grad.checkgrad(verbose=1)
-                grad.randomize()
-                grad.checkgrad(verbose=1)
-        """
-        Model.__init__(self, 'GradientChecker')
-        if isinstance(x0, (list, tuple)) and names is None:
-            self.shapes = [get_shape(xi) for xi in x0]
-            self.names = ['X{i}'.format(i=i) for i in range(len(x0))]
-        elif isinstance(x0, (list, tuple)) and names is not None:
-            self.shapes = [get_shape(xi) for xi in x0]
-            self.names = names
-        elif names is None:
-            self.names = ['X']
-            self.shapes = [get_shape(x0)]
-        else:
-            self.names = names
-            self.shapes = [get_shape(x0)]
-
-        for name, xi in zip(self.names, at_least_one_element(x0)):
-            self.__setattr__(name, Param(name, xi))
-            self.link_parameter(self.__getattribute__(name))
-#         self._param_names = []
-#         for name, shape in zip(self.names, self.shapes):
-#             self._param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
-        self.args = args
-        self.kwargs = kwargs
-        self.f = f
-        self.df = df
-
-    def _get_x(self):
-        if len(self.names) > 1:
-            return [self.__getattribute__(name) for name in self.names] + list(self.args)
-        return [self.__getattribute__(self.names[0])] + list(self.args)
-
-    def log_likelihood(self):
-        return float(numpy.sum(self.f(*self._get_x(), **self.kwargs)))
-
-    def _log_likelihood_gradients(self):
-        return numpy.atleast_1d(self.df(*self._get_x(), **self.kwargs)).flatten()
-
-    #def _get_params(self):
-        #return numpy.atleast_1d(numpy.hstack(map(lambda name: flatten_if_needed(self.__getattribute__(name)), self.names)))
-
-    #def _set_params(self, x):
-        #current_index = 0
-        #for name, shape in zip(self.names, self.shapes):
-            #current_size = numpy.prod(shape)
-            #self.__setattr__(name, x[current_index:current_index + current_size].reshape(shape))
-            #current_index += current_size
-
-    #def _get_param_names(self):
-        #_param_names = []
-        #for name, shape in zip(self.names, self.shapes):
-            #_param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
-        #return _param_names
--- a/GPy/models_modules/mrd.py
+++ b/GPy/models_modules/mrd.py
@ -1,341 +0,0 @@
-# ## Copyright (c) 2013, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-import itertools, logging
-
-from ..kern import Kern
-from ..core.parameterization.variational import NormalPosterior, NormalPrior
-from ..core.parameterization import Param, Parameterized
-from ..core.parameterization.observable_array import ObsAr
-from ..inference.latent_function_inference.var_dtc import VarDTC
-from ..inference.latent_function_inference import InferenceMethodList
-from ..likelihoods import Gaussian
-from ..util.initialization import initialize_latent
-from ..core.sparse_gp import SparseGP, GP
-from GPy.core.parameterization.variational import VariationalPosterior
-from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
-from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch
-
-class MRD(BayesianGPLVMMiniBatch):
-    """
-    !WARNING: This is bleeding edge code and still in development.
-    Functionality may change fundamentally during development!
-
-    Apply MRD to all given datasets Y in Ylist.
-
-    Y_i in [n x p_i]
-
-    If Ylist is a dictionary, the keys of the dictionary are the names, and the
-    values are the different datasets to compare.
-
-    The samples n in the datasets need
-    to match up, whereas the dimensionality p_d can differ.
-
-    :param [array-like] Ylist: List of datasets to apply MRD on
-    :param input_dim: latent dimensionality
-    :type input_dim: int
-    :param array-like X: mean of starting latent space q in [n x q]
-    :param array-like X_variance: variance of starting latent space q in [n x q]
-    :param initx: initialisation method for the latent space :
-
-        * 'concat' - PCA on concatenation of all datasets
-        * 'single' - Concatenation of PCA on datasets, respectively
-        * 'random' - Random draw from a Normal(0,1)
-
-    :type initx: ['concat'|'single'|'random']
-    :param initz: initialisation method for inducing inputs
-    :type initz: 'permute'|'random'
-    :param num_inducing: number of inducing inputs to use
-    :param Z: initial inducing inputs
-    :param kernel: list of kernels or kernel to copy for each output
-    :type kernel: [GPy.kernels.kernels] | GPy.kernels.kernels | None (default)
-    :param :class:`~GPy.inference.latent_function_inference inference_method:
-        InferenceMethodList of inferences, or one inference method for all
-    :param :class:`~GPy.likelihoodss.likelihoods.likelihoods` likelihoods: the likelihoods to use
-    :param str name: the name of this model
-    :param [str] Ynames: the names for the datasets given, must be of equal length as Ylist or None
-    :param bool|Norm normalizer: How to normalize the data?
-    :param bool stochastic: Should this model be using stochastic gradient descent over the dimensions?
-    :param bool|[bool] batchsize: either one batchsize for all, or one batchsize per dataset.
-    """
-    def __init__(self, Ylist, input_dim, X=None, X_variance=None,
-                 initx = 'PCA', initz = 'permute',
-                 num_inducing=10, Z=None, kernel=None,
-                 inference_method=None, likelihoods=None, name='mrd',
-                 Ynames=None, normalizer=False, stochastic=False, batchsize=10):
-
-        self.logger = logging.getLogger(self.__class__.__name__)
-        self.input_dim = input_dim
-        self.num_inducing = num_inducing
-
-        if isinstance(Ylist, dict):
-            Ynames, Ylist = zip(*Ylist.items())
-
-        self.logger.debug("creating observable arrays")
-        self.Ylist = [ObsAr(Y) for Y in Ylist]
-
-        if Ynames is None:
-            self.logger.debug("creating Ynames")
-            Ynames = ['Y{}'.format(i) for i in range(len(Ylist))]
-        self.names = Ynames
-        assert len(self.names) == len(self.Ylist), "one name per dataset, or None if Ylist is a dict"
-
-        if inference_method is None:
-            self.inference_method = InferenceMethodList([VarDTC() for _ in xrange(len(self.Ylist))])
-        else:
-            assert isinstance(inference_method, InferenceMethodList), "please provide one inference method per Y in the list and provide it as InferenceMethodList, inference_method given: {}".format(inference_method)
-            self.inference_method = inference_method
-
-        if X is None:
-            X, fracs = self._init_X(initx, Ylist)
-        else:
-            fracs = [X.var(0)]*len(Ylist)
-
-        Z = self._init_Z(initz, X)
-        self.Z = Param('inducing inputs', Z)
-        self.num_inducing = self.Z.shape[0] # ensure M==N if M>N
-
-        # sort out the kernels
-        self.logger.info("building kernels")
-        if kernel is None:
-            from ..kern import RBF
-            kernels = [RBF(input_dim, ARD=1, lengthscale=1./fracs[i]) for i in range(len(Ylist))]
-        elif isinstance(kernel, Kern):
-            kernels = []
-            for i in range(len(Ylist)):
-                k = kernel.copy()
-                kernels.append(k)
-        else:
-            assert len(kernel) == len(Ylist), "need one kernel per output"
-            assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!"
-            kernels = kernel
-
-        if X_variance is None:
-            X_variance = np.random.uniform(0.1, 0.2, X.shape)
-
-        self.variational_prior = NormalPrior()
-        #self.X = NormalPosterior(X, X_variance)
-
-        if likelihoods is None:
-            likelihoods = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(len(Ylist))]
-        else: likelihoods = likelihoods
-
-        self.logger.info("adding X and Z")
-        super(MRD, self).__init__(Y, input_dim, X=X, X_variance=X_variance, num_inducing=num_inducing,
-                 Z=self.Z, kernel=None, inference_method=self.inference_method, likelihood=Gaussian(),
-                 name='manifold relevance determination', normalizer=None,
-                 missing_data=False, stochastic=False, batchsize=1)
-
-        self._log_marginal_likelihood = 0
-
-        self.unlink_parameter(self.likelihood)
-        self.unlink_parameter(self.kern)
-        del self.kern
-        del self.likelihood
-
-        self.num_data = Ylist[0].shape[0]
-        if isinstance(batchsize, int):
-            batchsize = itertools.repeat(batchsize)
-
-        self.bgplvms = []
-
-        for i, n, k, l, Y, im, bs in itertools.izip(itertools.count(), Ynames, kernels, likelihoods, Ylist, self.inference_method, batchsize):
-            assert Y.shape[0] == self.num_data, "All datasets need to share the number of datapoints, and those have to correspond to one another"
-            md = np.isnan(Y).any()
-            spgp = BayesianGPLVMMiniBatch(Y, input_dim, X, X_variance,
-                                          Z=Z, kernel=k, likelihood=l,
-                                          inference_method=im, name=n,
-                                          normalizer=normalizer,
-                                          missing_data=md,
-                                          stochastic=stochastic,
-                                          batchsize=bs)
-            spgp.kl_factr = 1./len(Ynames)
-            spgp.unlink_parameter(spgp.Z)
-            spgp.unlink_parameter(spgp.X)
-            del spgp.Z
-            del spgp.X
-            spgp.Z = self.Z
-            spgp.X = self.X
-            self.link_parameter(spgp, i+2)
-            self.bgplvms.append(spgp)
-
-        self.posterior = None
-        self.logger.info("init done")
-
-    def parameters_changed(self):
-        self._log_marginal_likelihood = 0
-        self.Z.gradient[:] = 0.
-        self.X.gradient[:] = 0.
-        for b, i in itertools.izip(self.bgplvms, self.inference_method):
-            self._log_marginal_likelihood += b._log_marginal_likelihood
-
-            self.logger.info('working on im <{}>'.format(hex(id(i))))
-            self.Z.gradient[:] += b.full_values['Zgrad']
-            grad_dict = b.full_values
-
-            self.X.mean.gradient += grad_dict['meangrad']
-            self.X.variance.gradient += grad_dict['vargrad']
-
-        if isinstance(self.X, VariationalPosterior):
-            # update for the KL divergence
-            self.variational_prior.update_gradients_KL(self.X)
-            self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
-            pass
-
-    def log_likelihood(self):
-        return self._log_marginal_likelihood
-
-    def _init_X(self, init='PCA', Ylist=None):
-        if Ylist is None:
-            Ylist = self.Ylist
-        if init in "PCA_concat":
-            X, fracs = initialize_latent('PCA', self.input_dim, np.hstack(Ylist))
-            fracs = [fracs]*len(Ylist)
-        elif init in "PCA_single":
-            X = np.zeros((Ylist[0].shape[0], self.input_dim))
-            fracs = []
-            for qs, Y in itertools.izip(np.array_split(np.arange(self.input_dim), len(Ylist)), Ylist):
-                x,frcs = initialize_latent('PCA', len(qs), Y)
-                X[:, qs] = x
-                fracs.append(frcs)
-        else: # init == 'random':
-            X = np.random.randn(Ylist[0].shape[0], self.input_dim)
-            fracs = X.var(0)
-            fracs = [fracs]*len(Ylist)
-        X -= X.mean()
-        X /= X.std()
-        return X, fracs
-
-    def _init_Z(self, init="permute", X=None):
-        if X is None:
-            X = self.X
-        if init in "permute":
-            Z = np.random.permutation(X.copy())[:self.num_inducing]
-        elif init in "random":
-            Z = np.random.randn(self.num_inducing, self.input_dim) * X.var()
-        return Z
-
-    def _handle_plotting(self, fignum, axes, plotf, sharex=False, sharey=False):
-        import matplotlib.pyplot as plt
-        if axes is None:
-            fig = plt.figure(num=fignum)
-        sharex_ax = None
-        sharey_ax = None
-        plots = []
-        for i, g in enumerate(self.bgplvms):
-            try:
-                if sharex:
-                    sharex_ax = ax # @UndefinedVariable
-                    sharex = False # dont set twice
-                if sharey:
-                    sharey_ax = ax # @UndefinedVariable
-                    sharey = False # dont set twice
-            except:
-                pass
-            if axes is None:
-                ax = fig.add_subplot(1, len(self.bgplvms), i + 1, sharex=sharex_ax, sharey=sharey_ax)
-            elif isinstance(axes, (tuple, list, np.ndarray)):
-                ax = axes[i]
-            else:
-                raise ValueError("Need one axes per latent dimension input_dim")
-            plots.append(plotf(i, g, ax))
-            if sharey_ax is not None:
-                plt.setp(ax.get_yticklabels(), visible=False)
-        plt.draw()
-        if axes is None:
-            try:
-                fig.tight_layout()
-            except:
-                pass
-        return plots
-
-    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, Yindex=0):
-        """
-        Prediction for data set Yindex[default=0].
-        This predicts the output mean and variance for the dataset given in Ylist[Yindex]
-        """
-        b = self.bgplvms[Yindex]
-        self.posterior = b.posterior
-        self.kern = b.kern
-        self.likelihood = b.likelihood
-        return super(MRD, self).predict(Xnew, full_cov, Y_metadata, kern)
-
-    #===============================================================================
-    # TODO: Predict! Maybe even change to several bgplvms, which share an X?
-    #===============================================================================
-    #     def plot_predict(self, fignum=None, ax=None, sharex=False, sharey=False, **kwargs):
-    #         fig = self._handle_plotting(fignum,
-    #                                     ax,
-    #                                     lambda i, g, ax: ax.imshow(g.predict(g.X)[0], **kwargs),
-    #                                     sharex=sharex, sharey=sharey)
-    #         return fig
-
-    def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs):
-        """
-
-        TODO: Explain other parameters
-
-        :param titles: titles for axes of datasets
-
-        """
-        if titles is None:
-            titles = [r'${}$'.format(name) for name in self.names]
-        ymax = reduce(max, [np.ceil(max(g.kern.input_sensitivity())) for g in self.bgplvms])
-        def plotf(i, g, ax):
-            #ax.set_ylim([0,ymax])
-            return g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs)
-        fig = self._handle_plotting(fignum, ax, plotf, sharex=sharex, sharey=sharey)
-        return fig
-
-    def plot_latent(self, labels=None, which_indices=None,
-                resolution=50, ax=None, marker='o', s=40,
-                fignum=None, plot_inducing=True, legend=True,
-                plot_limits=None,
-                aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
-        """
-        see plotting.matplot_dep.dim_reduction_plots.plot_latent
-        if predict_kwargs is None, will plot latent spaces for 0th dataset (and kernel), otherwise give
-        predict_kwargs=dict(Yindex='index') for plotting only the latent space of dataset with 'index'.
-        """
-        import sys
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from matplotlib import pyplot as plt
-        from ..plotting.matplot_dep import dim_reduction_plots
-        if "Yindex" not in predict_kwargs:
-            predict_kwargs['Yindex'] = 0
-
-        Yindex = predict_kwargs['Yindex']
-        if ax is None:
-            fig = plt.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        else:
-            fig = ax.figure
-        self.kern = self.bgplvms[Yindex].kern
-        self.likelihood = self.bgplvms[Yindex].likelihood
-        plot = dim_reduction_plots.plot_latent(self, labels, which_indices,
-                                        resolution, ax, marker, s,
-                                        fignum, plot_inducing, legend,
-                                        plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
-        ax.set_title(self.bgplvms[Yindex].name)
-        try:
-            fig.tight_layout()
-        except:
-            pass
-
-        return plot
-
-    def __getstate__(self):
-        state = super(MRD, self).__getstate__()
-        if state.has_key('kern'):
-            del state['kern']
-        if state.has_key('likelihood'):
-            del state['likelihood']
-        return state
-
-    def __setstate__(self, state):
-        # TODO:
-        super(MRD, self).__setstate__(state)
-        self.kern = self.bgplvms[0].kern
-        self.likelihood = self.bgplvms[0].likelihood
-        self.parameters_changed()
--- a/GPy/models_modules/sparse_gp_classification.py
+++ b/GPy/models_modules/sparse_gp_classification.py
@ -1,46 +0,0 @@
-# Copyright (c) 2013, Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from ..core import SparseGP
-from .. import likelihoods
-from .. import kern
-from ..likelihoods import likelihood
-from ..inference.latent_function_inference import expectation_propagation_dtc
-
-class SparseGPClassification(SparseGP):
-    """
-    sparse Gaussian Process model for classification
-
-    This is a thin wrapper around the sparse_GP class, with a set of sensible defaults
-
-    :param X: input observations
-    :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
-    :param kernel: a GPy kernel, defaults to rbf+white
-    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_X: False|True
-    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_Y: False|True
-    :rtype: model object
-
-    """
-
-    #def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10):
-    def __init__(self, X, Y=None, likelihood=None, kernel=None, Z=None, num_inducing=10, Y_metadata=None):
-
-
-        if kernel is None:
-            kernel = kern.RBF(X.shape[1])
-
-        likelihood = likelihoods.Bernoulli()
-
-        if Z is None:
-            i = np.random.permutation(X.shape[0])[:num_inducing]
-            Z = X[i].copy()
-        else:
-            assert Z.shape[1] == X.shape[1]
-
-        SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method=expectation_propagation_dtc.EPDTC(), name='SparseGPClassification',Y_metadata=Y_metadata)
-    #def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None):
--- a/GPy/models_modules/sparse_gp_regression.py
+++ b/GPy/models_modules/sparse_gp_regression.py
@ -1,109 +0,0 @@
-# Copyright (c) 2012, James Hensman
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from ..core import SparseGP
-from ..core.sparse_gp_mpi import SparseGP_MPI
-from .. import likelihoods
-from .. import kern
-from ..inference.latent_function_inference import VarDTC
-from ..core.parameterization.variational import NormalPosterior
-from GPy.inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
-
-class SparseGPRegression(SparseGP_MPI):
-    """
-    Gaussian Process model for regression
-
-    This is a thin wrapper around the SparseGP class, with a set of sensible defalts
-
-    :param X: input observations
-    :param Y: observed values
-    :param kernel: a GPy kernel, defaults to rbf+white
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (num_inducing x input_dim) | None
-    :param num_inducing: number of inducing points (ignored if Z is passed, see note)
-    :type num_inducing: int
-    :rtype: model object
-
-    .. Note:: If no Z array is passed, num_inducing (default 10) points are selected from the data. Other wise num_inducing is ignored
-    .. Note:: Multiple independent outputs are allowed using columns of Y
-
-    """
-
-    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, X_variance=None, normalizer=None, mpi_comm=None):
-        num_data, input_dim = X.shape
-
-        # kern defaults to rbf (plus white for stability)
-        if kernel is None:
-            kernel = kern.RBF(input_dim)#  + kern.white(input_dim, variance=1e-3)
-
-        # Z defaults to a subset of the data
-        if Z is None:
-            i = np.random.permutation(num_data)[:min(num_inducing, num_data)]
-            Z = X.view(np.ndarray)[i].copy()
-        else:
-            assert Z.shape[1] == input_dim
-
-        likelihood = likelihoods.Gaussian()
-
-        if not (X_variance is None):
-            X = NormalPosterior(X,X_variance)
-            
-        if mpi_comm is not None:
-            from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
-            infr = VarDTC_minibatch(mpi_comm=mpi_comm)
-        else:
-            infr = VarDTC()
-
-        SparseGP_MPI.__init__(self, X, Y, Z, kernel, likelihood, inference_method=infr, normalizer=normalizer, mpi_comm=mpi_comm)
-
-    def parameters_changed(self):
-        from ..inference.latent_function_inference.var_dtc_parallel import update_gradients_sparsegp,VarDTC_minibatch
-        if isinstance(self.inference_method,VarDTC_minibatch):
-            update_gradients_sparsegp(self, mpi_comm=self.mpi_comm)
-        else:
-            super(SparseGPRegression, self).parameters_changed()
-
-class SparseGPRegressionUncertainInput(SparseGP):
-    """
-    Gaussian Process model for regression with Gaussian variance on the inputs (X_variance)
-
-    This is a thin wrapper around the SparseGP class, with a set of sensible defalts
-
-    """
-
-    def __init__(self, X, X_variance, Y, kernel=None, Z=None, num_inducing=10, normalizer=None):
-        """
-        :param X: input observations
-        :type X: np.ndarray (num_data x input_dim)
-        :param X_variance: The uncertainty in the measurements of X (Gaussian variance, optional)
-        :type X_variance: np.ndarray (num_data x input_dim)
-        :param Y: observed values
-        :param kernel: a GPy kernel, defaults to rbf+white
-        :param Z: inducing inputs (optional, see note)
-        :type Z: np.ndarray (num_inducing x input_dim) | None
-        :param num_inducing: number of inducing points (ignored if Z is passed, see note)
-        :type num_inducing: int
-        :rtype: model object
-
-        .. Note:: If no Z array is passed, num_inducing (default 10) points are selected from the data. Other wise num_inducing is ignored
-        .. Note:: Multiple independent outputs are allowed using columns of Y
-        """
-        num_data, input_dim = X.shape
-
-        # kern defaults to rbf (plus white for stability)
-        if kernel is None:
-            kernel = kern.RBF(input_dim) + kern.White(input_dim, variance=1e-3)
-
-        # Z defaults to a subset of the data
-        if Z is None:
-            i = np.random.permutation(num_data)[:min(num_inducing, num_data)]
-            Z = X[i].copy()
-        else:
-            assert Z.shape[1] == input_dim
-
-        likelihood = likelihoods.Gaussian()
-
-        SparseGP.__init__(self, X, Y, Z, kernel, likelihood, X_variance=X_variance, inference_method=VarDTC(), normalizer=normalizer)
-        self.ensure_default_constraints()
--- a/GPy/models_modules/sparse_gplvm.py
+++ b/GPy/models_modules/sparse_gplvm.py
@ -1,43 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-import sys
-from GPy.models.sparse_gp_regression import SparseGPRegression
-
-class SparseGPLVM(SparseGPRegression):
-    """
-    Sparse Gaussian Process Latent Variable Model
-
-    :param Y: observed data
-    :type Y: np.ndarray
-    :param input_dim: latent dimensionality
-    :type input_dim: int
-    :param init: initialisation method for the latent space
-    :type init: 'PCA'|'random'
-
-    """
-    def __init__(self, Y, input_dim, X=None, kernel=None, init='PCA', num_inducing=10):
-        if X is None:
-            from ..util.initialization import initialize_latent
-            X, fracs = initialize_latent(init, input_dim, Y)
-        SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
-
-    def parameters_changed(self):
-        super(SparseGPLVM, self).parameters_changed()
-        self.X.gradient = self.kern.gradients_X_diag(self.grad_dict['dL_dKdiag'], self.X)
-        self.X.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'], self.X, self.Z)
-
-    def plot_latent(self, labels=None, which_indices=None,
-                resolution=50, ax=None, marker='o', s=40,
-                fignum=None, plot_inducing=True, legend=True,
-                plot_limits=None, 
-                aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import dim_reduction_plots
-
-        return dim_reduction_plots.plot_latent(self, labels, which_indices,
-                resolution, ax, marker, s,
-                fignum, plot_inducing, legend,
-                plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
--- a/GPy/models_modules/warped_gp.py
+++ b/GPy/models_modules/warped_gp.py
@ -1,99 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from ..util.warping_functions import *
-from ..core import GP
-from .. import likelihoods
-from GPy.util.warping_functions import TanhWarpingFunction_d
-from GPy import kern
-
-class WarpedGP(GP):
-    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
-
-        if kernel is None:
-            kernel = kern.rbf(X.shape[1])
-
-        if warping_function == None:
-            self.warping_function = TanhWarpingFunction_d(warping_terms)
-            self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
-
-        self.scale_data = False
-        if self.scale_data:
-            Y = self._scale_data(Y)
-        self.has_uncertain_inputs = False
-        self.Y_untransformed = Y.copy()
-        self.predict_in_warped_space = False
-        likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
-
-        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
-        self._set_params(self._get_params())
-
-    def _scale_data(self, Y):
-        self._Ymax = Y.max()
-        self._Ymin = Y.min()
-        return (Y - self._Ymin) / (self._Ymax - self._Ymin) - 0.5
-
-    def _unscale_data(self, Y):
-        return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
-
-    def _set_params(self, x):
-        self.warping_params = x[:self.warping_function.num_parameters]
-        Y = self.transform_data()
-        self.likelihood.set_data(Y)
-        GP._set_params(self, x[self.warping_function.num_parameters:].copy())
-
-    def _get_params(self):
-        return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
-
-    def _get_param_names(self):
-        warping_names = self.warping_function._get_param_names()
-        param_names = GP._get_param_names(self)
-        return warping_names + param_names
-
-    def transform_data(self):
-        Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
-        return Y
-
-    def log_likelihood(self):
-        ll = GP.log_likelihood(self)
-        jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
-        return ll + np.log(jacobian).sum()
-
-    def _log_likelihood_gradients(self):
-        ll_grads = GP._log_likelihood_gradients(self)
-        alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
-        warping_grads = self.warping_function_gradients(alpha)
-
-        warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
-        return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
-
-    def warping_function_gradients(self, Kiy):
-        grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
-        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
-                                                                 return_covar_chain=True)
-        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
-        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
-
-        return -dquad_dpsi + djac_dpsi
-
-    def plot_warping(self):
-        self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
-
-    def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
-        # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
-
-        # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
-
-        if self.predict_in_warped_space:
-            mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
-            var = self.warping_function.f_inv(var, self.warping_params)
-
-        if self.scale_data:
-            mean = self._unscale_data(mean)
-        
-        return mean, var, _025pm, _975pm
--- a/GPy/plotting/matplot_dep/Tango.py
+++ b/GPy/plotting/matplot_dep/Tango.py
@ -2,6 +2,9 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


+import matplotlib as mpl
+import pylab as pb
+import sys
 #sys.path.append('/home/james/mlprojects/sitran_cluster/')
 #from switch_pylab_backend import *

@ -81,7 +84,6 @@ def reset():
        lightList.append(lightList.pop(0))

 def setLightFigures():
-    import matplotlib as mpl
    mpl.rcParams['axes.edgecolor']=colorsHex['Aluminium6']
    mpl.rcParams['axes.facecolor']=colorsHex['Aluminium2']
    mpl.rcParams['axes.labelcolor']=colorsHex['Aluminium6']
@ -95,7 +97,6 @@ def setLightFigures():
    mpl.rcParams['ytick.color']=colorsHex['Aluminium6']

 def setDarkFigures():
-    import matplotlib as mpl
    mpl.rcParams['axes.edgecolor']=colorsHex['Aluminium2']
    mpl.rcParams['axes.facecolor']=colorsHex['Aluminium6']
    mpl.rcParams['axes.labelcolor']=colorsHex['Aluminium2']
@ -156,10 +157,10 @@ cdict_Alu = {'red' :((0./5,colorsRGB['Aluminium1'][0]/256.,colorsRGB['Aluminium1
                     (5./5,colorsRGB['Aluminium6'][2]/256.,colorsRGB['Aluminium6'][2]/256.))}
 # cmap_Alu = mpl.colors.LinearSegmentedColormap('TangoAluminium',cdict_Alu,256)
 # cmap_BGR = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_BGR,256)
+# cmap_RB = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_RB,256)
 if __name__=='__main__':
-    import matplotlib.pyplot as pb, numpy as np
+    import pylab as pb
    pb.figure()
-    cmap_RB = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_RB,256)
-    pb.pcolor(np.random.rand(10,10),cmap=cmap_RB)
+    pb.pcolor(pb.rand(10,10),cmap=cmap_RB)
    pb.colorbar()
    pb.show()
--- a/GPy/util/datasets/data_resources_create.py
+++ b/GPy/util/datasets/data_resources_create.py
@ -1,134 +0,0 @@
-import json
-
-neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
-sam_url = 'http://www.cs.nyu.edu/~roweis/data/'
-cmu_url = 'http://mocap.cs.cmu.edu/subjects/'
-
-data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
-                                       'files' : [['ankurDataPoseSilhouette.mat']],
-                                       'license' : None,
-                                       'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
-                                       'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
-
-                  'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
-                                      'files' : [['Index', 'housing.data', 'housing.names']],
-                                      'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
-                                      'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""",
-                                      'license' : None,
-                                      'size' : 51276
-                                      },
-                  'brendan_faces' : {'urls' : [sam_url],
-                                     'files': [['frey_rawface.mat']],
-                                     'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
-                                     'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
-                                     'license': None,
-                                     'size' : 1100584},
-                  'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
-                                      'files' : [['allasfamc.zip']],
-                                      'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'
-                                      'The database was created with funding from NSF EIA-0196217.""",
-                                      'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
-                                      'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
-                                      'size' : None},
-                  'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
-                                     'files' : [['creeprupt.tar']],
-                                     'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
-                                     'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""",
-                                     'license' : None,
-                                     'size' : 602797},
-                  'della_gatta' : {'urls' : [neil_url + 'della_gatta/'],
-                                   'files': [['DellaGattadata.mat']],
-                                   'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008',
-                                   'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.",
-                                   'license':None,
-                                   'size':3729650},
-                  'epomeo_gpx' : {'urls' : [neil_url + 'epomeo_gpx/'],
-                                   'files': [['endomondo_1.gpx', 'endomondo_2.gpx', 'garmin_watch_via_endomondo.gpx','viewranger_phone.gpx','viewranger_tablet.gpx']],
-                                   'citation' : '',
-                                   'details': "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).",
-                                   'license':None,
-                                   'size': 2031872},
-                  'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'],
-                                           'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']],
-                                           'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593',
-                                           'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""",
-                                           'license' : None,
-                                           'size' : 712796},
-                  'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'],
-                                            'files' : [['firstcoursemldata.tar.gz']],
-                                            'suffices' : [['?dl=1']],
-                                            'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146',
-                                            'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""",
-                                            'license' : None,
-                                            'size' : 21949154},
-                  'olivetti_faces' : {'urls' : [neil_url + 'olivetti_faces/', sam_url],
-                                      'files' : [['att_faces.zip'], ['olivettifaces.mat']],
-                                            'citation' : 'Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994',
-                                            'details' : """Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. """,
-                                            'license': None,
-                                            'size' : 8561331},
-                  'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'],
-                                            'files' : [['olympicMarathonTimes.csv']],
-                                            'citation' : None,
-                                            'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""",
-                                            'license': None,
-                                            'size' : 584},
-                  'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
-                                'files': [['run1TXT.ZIP'],['connections.txt']],
-                                'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.",
-                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
-                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
-                                'size': 338103},
-                  'osu_accad' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
-                                'files': [['swagger1TXT.ZIP','handspring1TXT.ZIP','quickwalkTXT.ZIP','run1TXT.ZIP','sprintTXT.ZIP','dogwalkTXT.ZIP','camper_04TXT.ZIP','dance_KB3_TXT.ZIP','per20_TXT.ZIP','perTWO07_TXT.ZIP','perTWO13_TXT.ZIP','perTWO14_TXT.ZIP','perTWO15_TXT.ZIP','perTWO16_TXT.ZIP'],['connections.txt']],
-                                'details' : "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.",
-                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
-                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
-                                'size': 15922790},
-                  'pumadyn-32nm' : {'urls' : ['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'],
-                                    'files' : [['pumadyn-32nm.tar.gz']],
-                                    'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""",
-                                    'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""",
-                                    'license' : """Data is made available by the Delve system at the University of Toronto""",
-                                    'size' : 5861646},
-                  'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'],
-                                      'files' : [['uw-floor.txt']],
-                                      'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""",
-                                      'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""",
-                                      'license' : None,
-                                      'size' : 284390},
-                  'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'],
-                                  'files' : [['swiss_roll_data.mat']],
-                                  'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
-                                  'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
-                                  'license' : None,
-                                  'size' : 800256},
-                  'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'],
-                                        'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']],
-                                        'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""",
-                                        'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""",
-                                        'license' : None,
-                                        'size' : 93565},
-                  'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'],
-                                        'files' : [['face_data.mat']],
-                                        'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
-                                        'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
-                                        'license' : None,
-                                        'size' : 24229368},
-                  'xw_pen' : {'urls' : [neil_url + 'xw_pen/'],
-                                        'files' : [['xw_pen_15.csv']],
-                                        'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
-                                        'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
-                                        'license' : None,
-                                        'size' : 3410},
-                  'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'],
-                                 'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']],
-                                 'details' : """HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.""",
-                                 'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""",
-                                 'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""",
-                                 'size' : 2*1729092237 + 62265},
-                  }
-
-with open('data_resources.json', 'w') as f:
-    print "writing data_resources"
-    json.dump(data_resources, f)
--- a/GPy/version
+++ b/GPy/version
@ -1 +0,0 @@
-0.4.9
--- a/doc/GPy.models_modules.rst
+++ b/doc/GPy.models_modules.rst
@ -1,134 +0,0 @@
-GPy.models_modules package
-==========================
-
-Submodules
----------
-
-GPy.models_modules.bayesian_gplvm module
----------------------------------------
-
-.. automodule:: GPy.models_modules.bayesian_gplvm
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.bcgplvm module
---------------------------------
-
-.. automodule:: GPy.models_modules.bcgplvm
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.fitc_classification module
---------------------------------------------
-
-.. automodule:: GPy.models_modules.fitc_classification
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.gp_classification module
-------------------------------------------
-
-.. automodule:: GPy.models_modules.gp_classification
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.gp_multioutput_regression module
---------------------------------------------------
-
-.. automodule:: GPy.models_modules.gp_multioutput_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.gp_regression module
---------------------------------------
-
-.. automodule:: GPy.models_modules.gp_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.gplvm module
-------------------------------
-
-.. automodule:: GPy.models_modules.gplvm
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.gradient_checker module
------------------------------------------
-
-.. automodule:: GPy.models_modules.gradient_checker
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.mrd module
-----------------------------
-
-.. automodule:: GPy.models_modules.mrd
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.sparse_gp_classification module
--------------------------------------------------
-
-.. automodule:: GPy.models_modules.sparse_gp_classification
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.sparse_gp_multioutput_regression module
----------------------------------------------------------
-
-.. automodule:: GPy.models_modules.sparse_gp_multioutput_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.sparse_gp_regression module
----------------------------------------------
-
-.. automodule:: GPy.models_modules.sparse_gp_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.sparse_gplvm module
--------------------------------------
-
-.. automodule:: GPy.models_modules.sparse_gplvm
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.svigp_regression module
------------------------------------------
-
-.. automodule:: GPy.models_modules.svigp_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models_modules.warped_gp module
-----------------------------------
-
-.. automodule:: GPy.models_modules.warped_gp
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-
-Module contents
---------------
-
-.. automodule:: GPy.models_modules
-    :members:
-    :undoc-members:
-    :show-inheritance:
--- a/doc/GPy.rst
+++ b/doc/GPy.rst
@ -12,24 +12,11 @@ Subpackages
    GPy.kern
    GPy.likelihoods
    GPy.mappings
-    GPy.models_modules
    GPy.models
    GPy.plotting
    GPy.testing
    GPy.util

-Submodules
----------
-
-GPy.models module
-----------------
-
-.. automodule:: GPy.models
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-
 Module contents
 ---------------

--- a/doc/index.rst
+++ b/doc/index.rst
@ -22,9 +22,6 @@ The code can be found on our `Github project page <https://github.com/SheffieldM

 .. You may also be interested by some examples in the GPy/examples folder.

-The detailed Developers Documentation is listed below
-=====================================================
-
 Contents:

 .. toctree::
--- a/doc/tuto_GP_regression.rst
+++ b/doc/tuto_GP_regression.rst
@ -139,4 +139,4 @@ directly::
    :align:   center
    :height: 350px

-    Contour plot of the mean predictor (posterior mean).
+    Contour plot of the best predictor (posterior mean).