Merge branch 'cython2' into devel

This commit is contained in:
James Hensman 2015-04-28 16:05:37 +01:00
commit 44fbcb4914
23 changed files with 25833 additions and 377 deletions

View file

@ -5,6 +5,7 @@ import numpy
from numpy.lib.function_base import vectorize
from .lists_and_dicts import IntArrayDict
from functools import reduce
from transformations import Transformation
def extract_properties_to_index(index, props):
prop_index = dict()

View file

@ -6,10 +6,10 @@ import numpy; np = numpy
import itertools
from re import compile, _pattern_type
from .param import ParamConcatenation
from .parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
import logging
from GPy.core.parameterization.index_operations import ParameterIndexOperationsView
from index_operations import ParameterIndexOperationsView
logger = logging.getLogger("parameters changed meta")
class ParametersChangedMeta(type):

View file

@ -730,7 +730,7 @@ class DGPLVM(Prior):
# ******************************************
from parameterized import Parameterized
from .. import Parameterized
from .. import Param
class DGPLVM_Lamda(Prior, Parameterized):
"""

View file

@ -25,3 +25,6 @@ MKL = False
[weave]
#if true, try to use weave, and fall back to numpy. if false, just use numpy.
working = True
[cython]
working = True

View file

@ -5,12 +5,8 @@ from .kern import Kern
import numpy as np
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
from ...util.config import config # for assesing whether to use weave
try:
from scipy import weave
except ImportError:
config.set('weave', 'working', 'False')
from ...util.config import config # for assesing whether to use cython
import coregionalize_cython
class Coregionalize(Kern):
"""
@ -61,13 +57,8 @@ class Coregionalize(Kern):
self.B = np.dot(self.W, self.W.T) + np.diag(self.kappa)
def K(self, X, X2=None):
if config.getboolean('weave', 'working'):
try:
return self._K_weave(X, X2)
except:
print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
config.set('weave', 'working', 'False')
return self._K_numpy(X, X2)
if config.getboolean('cython', 'working'):
return self._K_cython(X, X2)
else:
return self._K_numpy(X, X2)
@ -80,36 +71,10 @@ class Coregionalize(Kern):
index2 = np.asarray(X2, dtype=np.int)
return self.B[index,index2.T]
def _K_weave(self, X, X2=None):
"""compute the kernel function using scipy.weave"""
index = np.asarray(X, dtype=np.int)
def _K_cython(self, X, X2=None):
if X2 is None:
target = np.empty((X.shape[0], X.shape[0]), dtype=np.float64)
code="""
for(int i=0;i<N; i++){
target[i+i*N] = B[index[i]+output_dim*index[i]];
for(int j=0; j<i; j++){
target[j+i*N] = B[index[i]+output_dim*index[j]];
target[i+j*N] = target[j+i*N];
}
}
"""
N, B, output_dim = index.size, self.B, self.output_dim
weave.inline(code, ['target', 'index', 'N', 'B', 'output_dim'])
else:
index2 = np.asarray(X2, dtype=np.int)
target = np.empty((X.shape[0], X2.shape[0]), dtype=np.float64)
code="""
for(int i=0;i<num_inducing; i++){
for(int j=0; j<N; j++){
target[i+j*num_inducing] = B[output_dim*index[j]+index2[i]];
}
}
"""
N, num_inducing, B, output_dim = index.size, index2.size, self.B, self.output_dim
weave.inline(code, ['target', 'index', 'index2', 'N', 'num_inducing', 'B', 'output_dim'])
return target
return coregionalize_cython.K_symmetric(self.B, np.asarray(X, dtype=np.int64)[:,0])
return coregionalize_cython.K_asymmetric(self.B, np.asarray(X, dtype=np.int64)[:,0], np.asarray(X2, dtype=np.int64)[:,0])
def Kdiag(self, X):
@ -122,19 +87,13 @@ class Coregionalize(Kern):
else:
index2 = np.asarray(X2, dtype=np.int)
#attempt to use weave for a nasty double indexing loop: fall back to numpy
if config.getboolean('weave', 'working'):
try:
dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
except:
print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
config.set('weave', 'working', 'False')
dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
#attempt to use cython for a nasty double indexing loop: fall back to numpy
if config.getboolean('cython', 'working'):
dL_dK_small = self._gradient_reduce_cython(dL_dK, index, index2)
else:
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
dkappa = np.diag(dL_dK_small)
dL_dK_small += dL_dK_small.T
dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
@ -142,19 +101,6 @@ class Coregionalize(Kern):
self.W.gradient = dW
self.kappa.gradient = dkappa
def _gradient_reduce_weave(self, dL_dK, index, index2):
dL_dK_small = np.zeros_like(self.B)
code="""
for(int i=0; i<num_inducing; i++){
for(int j=0; j<N; j++){
dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
}
}
"""
N, num_inducing, output_dim = index.size, index2.size, self.output_dim
weave.inline(code, ['N', 'num_inducing', 'output_dim', 'dL_dK', 'dL_dK_small', 'index', 'index2'])
return dL_dK_small
def _gradient_reduce_numpy(self, dL_dK, index, index2):
index, index2 = index[:,0], index2[:,0]
dL_dK_small = np.zeros_like(self.B)
@ -164,6 +110,11 @@ class Coregionalize(Kern):
dL_dK_small[j,i] = tmp1[:,index2==j].sum()
return dL_dK_small
def _gradient_reduce_cython(self, dL_dK, index, index2):
index, index2 = index[:,0], index2[:,0]
return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2)
def update_gradients_diag(self, dL_dKdiag, X):
index = np.asarray(X, dtype=np.int).flatten()
dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in range(self.output_dim)])

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,34 @@
#cython: boundscheck=True
#cython: wraparound=True
import cython
import numpy as np
cimport numpy as np
def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X):
    """Build the symmetric Gram matrix out[i, j] = B[X[i], X[j]].

    B is the (output_dim x output_dim) coregionalization matrix and X the
    integer output-index of each data row.
    """
    cdef int num = X.size
    cdef int i, j
    cdef np.ndarray[np.double_t, ndim=2] out = np.empty((num, num))
    for i in range(num):
        for j in range(num):
            out[i, j] = B[X[i], X[j]]
    return out
def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2):
    """Build the cross Gram matrix out[i, j] = B[X[i], X2[j]].

    Rectangular counterpart of K_symmetric: X indexes the rows, X2 the
    columns of the result.
    """
    cdef int rows = X.size
    cdef int cols = X2.size
    cdef int i, j
    cdef np.ndarray[np.double_t, ndim=2] out = np.empty((rows, cols))
    for i in range(rows):
        for j in range(cols):
            out[i, j] = B[X[i], X2[j]]
    return out
def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2):
    """Accumulate the (N x M) gradient dL_dK into a (D x D) array.

    Entry dL_dK[i, j] is added into row index2[j], column index[i] of the
    result, collapsing the per-datapoint gradient onto the coregionalization
    matrix B.
    """
    cdef int rows = index.size
    cdef int cols = index2.size
    cdef int i, j
    cdef np.ndarray[np.double_t, ndim=2] out = np.zeros((D, D))
    for i in range(rows):
        for j in range(cols):
            out[index2[j], index[i]] += dL_dK[i, j]
    return out

View file

@ -9,13 +9,15 @@ from ...util.linalg import tdot
from ... import util
import numpy as np
from scipy import integrate
from ...util.config import config # for assesing whether to use weave
from ...util.config import config # for assesing whether to use cython
from ...util.caching import Cache_this
try:
from scipy import weave
import stationary_cython
except ImportError:
config.set('weave', 'working', 'False')
print('warning: failed to import cython module: falling back to numpy')
config.set('cython', 'working', 'false')
class Stationary(Kern):
"""
@ -153,28 +155,18 @@ class Stationary(Kern):
(dL_dK), compute the gradient wrt the parameters of this kernel,
and store in the parameters object as e.g. self.variance.gradient
"""
self.variance.gradient = np.einsum('ij,ij,i', self.K(X, X2), dL_dK, 1./self.variance)
self.variance.gradient = np.sum(self.K(X, X2)* dL_dK)/self.variance
#now the lengthscale gradient(s)
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
if self.ARD:
#rinv = self._inv_dis# this is rather high memory? Should we loop instead?t(X, X2)
#d = X[:, None, :] - X2[None, :, :]
#x_xl3 = np.square(d)
#self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
tmp = dL_dr*self._inv_dist(X, X2)
if X2 is None: X2 = X
if config.getboolean('weave', 'working'):
try:
self.lengthscale.gradient = self.weave_lengthscale_grads(tmp, X, X2)
except:
print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
config.set('weave', 'working', 'False')
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in range(self.input_dim)])
if config.getboolean('cython', 'working'):
self.lengthscale.gradient = self._lengthscale_grads_cython(tmp, X, X2)
else:
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in range(self.input_dim)])
self.lengthscale.gradient = self._lengthscale_grads_pure(tmp, X, X2)
else:
r = self._scaled_dist(X, X2)
self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale
@ -189,43 +181,27 @@ class Stationary(Kern):
dist = self._scaled_dist(X, X2).copy()
return 1./np.where(dist != 0., dist, np.inf)
def weave_lengthscale_grads(self, tmp, X, X2):
"""Use scipy.weave to compute derivatives wrt the lengthscales"""
def _lengthscale_grads_pure(self, tmp, X, X2):
return -np.array([np.sum(tmp * np.square(X[:,q:q+1] - X2[:,q:q+1].T)) for q in range(self.input_dim)])/self.lengthscale**3
def _lengthscale_grads_cython(self, tmp, X, X2):
N,M = tmp.shape
Q = X.shape[1]
if hasattr(X, 'values'):X = X.values
if hasattr(X2, 'values'):X2 = X2.values
Q = self.input_dim
X, X2 = np.ascontiguousarray(X), np.ascontiguousarray(X2)
grads = np.zeros(self.input_dim)
code = """
double gradq;
for(int q=0; q<Q; q++){
gradq = 0;
for(int n=0; n<N; n++){
for(int m=0; m<M; m++){
gradq += tmp(n,m)*(X(n,q)-X2(m,q))*(X(n,q)-X2(m,q));
}
}
grads(q) = gradq;
}
"""
weave.inline(code, ['tmp', 'X', 'X2', 'grads', 'N', 'M', 'Q'], type_converters=weave.converters.blitz, support_code="#include <math.h>")
stationary_cython.lengthscale_grads(N, M, Q, tmp, X, X2, grads)
return -grads/self.lengthscale**3
def gradients_X(self, dL_dK, X, X2=None):
"""
Given the derivative of the objective wrt K (dL_dK), compute the derivative wrt X
"""
if config.getboolean('weave', 'working'):
try:
return self.gradients_X_weave(dL_dK, X, X2)
except:
print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
config.set('weave', 'working', 'False')
return self.gradients_X_(dL_dK, X, X2)
if config.getboolean('cython', 'working'):
return self._gradients_X_cython(dL_dK, X, X2)
else:
return self.gradients_X_(dL_dK, X, X2)
return self._gradients_X_pure(dL_dK, X, X2)
def gradients_X_(self, dL_dK, X, X2=None):
def _gradients_X_pure(self, dL_dK, X, X2=None):
invdist = self._inv_dist(X, X2)
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
tmp = invdist*dL_dr
@ -235,54 +211,25 @@ class Stationary(Kern):
#The high-memory numpy way:
#d = X[:, None, :] - X2[None, :, :]
#ret = np.sum(tmp[:,:,None]*d,1)/self.lengthscale**2
#grad = np.sum(tmp[:,:,None]*d,1)/self.lengthscale**2
#the lower memory way with a loop
ret = np.empty(X.shape, dtype=np.float64)
grad = np.empty(X.shape, dtype=np.float64)
for q in range(self.input_dim):
np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q])
ret /= self.lengthscale**2
np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=grad[:,q])
return grad/self.lengthscale**2
return ret
def gradients_X_weave(self, dL_dK, X, X2=None):
def _gradients_X_cython(self, dL_dK, X, X2=None):
invdist = self._inv_dist(X, X2)
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
tmp = invdist*dL_dr
if X2 is None:
tmp = tmp + tmp.T
X2 = X
code = """
int n,m,d;
double retnd;
#pragma omp parallel for private(n,d, retnd, m)
for(d=0;d<D;d++){
for(n=0;n<N;n++){
retnd = 0.0;
for(m=0;m<M;m++){
retnd += tmp(n,m)*(X(n,d)-X2(m,d));
}
ret(n,d) = retnd;
}
}
"""
if hasattr(X, 'values'):X = X.values #remove the GPy wrapping to make passing into weave safe
if hasattr(X2, 'values'):X2 = X2.values
ret = np.zeros(X.shape)
N,D = X.shape
N,M = tmp.shape
from scipy import weave
support_code = """
#include <omp.h>
#include <stdio.h>
"""
weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
'extra_link_args' : ['-lgomp']}
weave.inline(code, ['ret', 'N', 'D', 'M', 'tmp', 'X', 'X2'], type_converters=weave.converters.blitz, support_code=support_code, **weave_options)
return ret/self.lengthscale**2
X, X2 = np.ascontiguousarray(X), np.ascontiguousarray(X2)
grad = np.zeros(X.shape)
stationary_cython.grad_X(X.shape[0], X.shape[1], X2.shape[0], X, X2, tmp, grad)
return grad/self.lengthscale**2
def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
@ -290,6 +237,9 @@ class Stationary(Kern):
def input_sensitivity(self, summarize=True):
return self.variance*np.ones(self.input_dim)/self.lengthscale**2
class Exponential(Stationary):
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Exponential'):
super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,36 @@
#cython: boundscheck=False
#cython: wraparound=False
import numpy as np
cimport numpy as np
ctypedef np.float64_t DTYPE_t
cdef extern from "stationary_utils.h":
void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad)
cdef extern from "stationary_utils.h":
void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad)
def grad_X(int N, int D, int M,
           np.ndarray[DTYPE_t, ndim=2] _X,
           np.ndarray[DTYPE_t, ndim=2] _X2,
           np.ndarray[DTYPE_t, ndim=2] _tmp,
           np.ndarray[DTYPE_t, ndim=2] _grad):
    """Fill _grad (N x D) in place: _grad[n,d] = sum_m _tmp[n,m]*(_X[n,d]-_X2[m,d]).

    Thin wrapper that hands raw data pointers to the C helper _grad_X.
    Returns nothing; the result is written into _grad.
    NOTE(review): passing .data assumes all four arrays are C-contiguous
    float64 — callers are expected to np.ascontiguousarray them first; confirm.
    """
    # Extract raw C pointers from the numpy buffers (no copy).
    cdef double *X = <double*> _X.data
    cdef double *X2 = <double*> _X2.data
    cdef double *tmp = <double*> _tmp.data
    cdef double *grad = <double*> _grad.data
    _grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.
def lengthscale_grads(int N, int M, int Q,
                      np.ndarray[DTYPE_t, ndim=2] _tmp,
                      np.ndarray[DTYPE_t, ndim=2] _X,
                      np.ndarray[DTYPE_t, ndim=2] _X2,
                      np.ndarray[DTYPE_t, ndim=1] _grad):
    """Fill _grad (length Q) in place: _grad[q] = sum_{n,m} _tmp[n,m]*(_X[n,q]-_X2[m,q])**2.

    Thin wrapper around the C helper _lengthscale_grads; returns nothing.
    NOTE(review): passing .data assumes the arrays are C-contiguous
    float64 — callers are expected to np.ascontiguousarray them first; confirm.
    """
    # Extract raw C pointers from the numpy buffers (no copy).
    cdef double *tmp = <double*> _tmp.data
    cdef double *X = <double*> _X.data
    cdef double *X2 = <double*> _X2.data
    cdef double *grad = <double*> _grad.data
    _lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.

View file

@ -0,0 +1,35 @@
/* Compute grad[n*D+d] = sum_m tmp[n*M+m] * (X[n*D+d] - X2[m*D+d]).
 * X is N x D, X2 is M x D, tmp is N x M; all row-major contiguous doubles.
 * grad (N x D) is written in place. */
void _grad_X(int N, int D, int M, double* X, double* X2, double* tmp, double* grad){
  int n,m,d;
  double retnd;
  /* NOTE(review): the OpenMP pragma was left disabled in the original;
   * the d-loop carries no cross-iteration writes, but confirm before enabling. */
  //#pragma omp parallel for private(n,d, retnd, m)
  for(d=0;d<D;d++){
    for(n=0;n<N;n++){
      retnd = 0.0;
      for(m=0;m<M;m++){
        retnd += tmp[n*M+m]*(X[n*D+d]-X2[m*D+d]);
      }
      grad[n*D+d] = retnd;
    }
  }
} //grad_X
/* Compute grad[q] = sum_{n,m} tmp[n*M+m] * (X[n*Q+q] - X2[m*Q+q])^2.
 * X is N x Q, X2 is M x Q, tmp is N x M; all row-major contiguous doubles.
 * grad (length Q) is written in place.  The outer q-loop is parallelised:
 * each thread writes only its own grad[q], so no synchronisation is needed. */
void _lengthscale_grads(int N, int M, int Q, double* tmp, double* X, double* X2, double* grad){
  int n,m,q;
  double gradq, dist;
  #pragma omp parallel for private(n,m, gradq, dist)
  for(q=0; q<Q; q++){
    gradq = 0;
    for(n=0; n<N; n++){
      for(m=0; m<M; m++){
        dist = X[n*Q+q]-X2[m*Q+q];
        gradq += tmp[n*M+m]*dist*dist;
      }
    }
    grad[q] = gradq;
  }
} //lengthscale_grads

View file

@ -0,0 +1,3 @@
/* Prototypes for the stationary-kernel helper loops (stationary_utils.c). */
#include <omp.h>

/* grad[n*D+d] = sum_m tmp[n*M+m] * (X[n*D+d] - X2[m*D+d]); grad is N x D, in place. */
void _grad_X(int N, int D, int M, double* X, double* X2, double* tmp, double* grad);

/* grad[q] = sum_{n,m} tmp[n*M+m] * (X[n*Q+q] - X2[m*Q+q])^2; grad has length Q, in place.
 * Parameter names fixed to match the definition: the original prototype declared
 * (int N, int D, int M, double* X, double* X2, double* tmp, ...), which is
 * ABI-compatible but contradicts the actual argument order (tmp comes first). */
void _lengthscale_grads(int N, int M, int Q, double* tmp, double* X, double* X2, double* grad);

View file

@ -0,0 +1,57 @@
import numpy as np
import scipy as sp
from GPy.util import choleskies
import GPy
"""
These tests make sure that the pure Python and Cython codes work the same
"""
class CythonTestChols(np.testing.TestCase):
    """Verify the cython cholesky helpers against their pure-python twins."""

    def setUp(self):
        # 45 = 9*10/2, so each of the 5 columns unpacks to a 9x9 lower triangle
        self.flat = np.random.randn(45, 5)
        self.triang = np.dstack([np.eye(20)[:, :, None] for i in range(3)])

    def test_flat_to_triang(self):
        expected = choleskies._flat_to_triang_pure(self.flat)
        actual = choleskies._flat_to_triang_cython(self.flat)
        np.testing.assert_allclose(expected, actual)

    def test_triang_to_flat(self):
        expected = choleskies._triang_to_flat_pure(self.triang)
        actual = choleskies._triang_to_flat_cython(self.triang)
        np.testing.assert_allclose(expected, actual)
class test_stationary(np.testing.TestCase):
    """Check that the cython and pure-python gradient paths of a stationary
    kernel (RBF) agree, on both square and rectangular inputs."""

    def setUp(self):
        self.k = GPy.kern.RBF(10)
        self.X = np.random.randn(300, 10)
        self.Z = np.random.randn(20, 10)
        self.dKxx = np.random.randn(300, 300)
        self.dKzz = np.random.randn(20, 20)
        self.dKxz = np.random.randn(300, 20)

    def _assert_gradX_agree(self, dL_dK, X, X2=None):
        # one call per backend so a failure pinpoints the mismatch
        cython_grad = self.k._gradients_X_cython(dL_dK, X, X2)
        pure_grad = self.k._gradients_X_pure(dL_dK, X, X2)
        np.testing.assert_allclose(cython_grad, pure_grad)

    def _assert_lengthscale_agree(self, dL_dK, X, X2):
        pure_grad = self.k._lengthscale_grads_pure(dL_dK, X, X2)
        cython_grad = self.k._lengthscale_grads_cython(dL_dK, X, X2)
        np.testing.assert_allclose(pure_grad, cython_grad)

    def test_square_gradX(self):
        self._assert_gradX_agree(self.dKxx, self.X)

    def test_rect_gradx(self):
        self._assert_gradX_agree(self.dKxz, self.X, self.Z)

    def test_square_lengthscales(self):
        self._assert_lengthscale_agree(self.dKxx, self.X, self.X)

    def test_rect_lengthscales(self):
        self._assert_lengthscale_agree(self.dKxz, self.X, self.Z)

View file

@ -366,9 +366,9 @@ class KernelTestsNonContinuous(unittest.TestCase):
X2 = self.X2[self.X2[:,-1]!=2]
self.assertTrue(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1))
class Coregionalize_weave_test(unittest.TestCase):
class Coregionalize_cython_test(unittest.TestCase):
"""
Make sure that the coregionalize kernel work with and without weave enabled
Make sure that the coregionalize kernel work with and without cython enabled
"""
def setUp(self):
self.k = GPy.kern.Coregionalize(1, output_dim=12)
@ -378,36 +378,42 @@ class Coregionalize_weave_test(unittest.TestCase):
def test_sym(self):
dL_dK = np.random.randn(self.N1, self.N1)
GPy.util.config.config.set('weave', 'working', 'True')
K_weave = self.k.K(self.X)
GPy.util.config.config.set('cython', 'working', 'True')
K_cython = self.k.K(self.X)
self.k.update_gradients_full(dL_dK, self.X)
grads_weave = self.k.gradient.copy()
grads_cython = self.k.gradient.copy()
GPy.util.config.config.set('weave', 'working', 'False')
GPy.util.config.config.set('cython', 'working', 'False')
K_numpy = self.k.K(self.X)
self.k.update_gradients_full(dL_dK, self.X)
grads_numpy = self.k.gradient.copy()
self.assertTrue(np.allclose(K_numpy, K_weave))
self.assertTrue(np.allclose(grads_numpy, grads_weave))
self.assertTrue(np.allclose(K_numpy, K_cython))
self.assertTrue(np.allclose(grads_numpy, grads_cython))
#reset the cython state for any other tests
GPy.util.config.config.set('cython', 'working', 'true')
def test_nonsym(self):
dL_dK = np.random.randn(self.N1, self.N2)
GPy.util.config.config.set('weave', 'working', 'True')
K_weave = self.k.K(self.X, self.X2)
GPy.util.config.config.set('cython', 'working', 'True')
K_cython = self.k.K(self.X, self.X2)
self.k.gradient = 0.
self.k.update_gradients_full(dL_dK, self.X, self.X2)
grads_weave = self.k.gradient.copy()
grads_cython = self.k.gradient.copy()
GPy.util.config.config.set('weave', 'working', 'False')
GPy.util.config.config.set('cython', 'working', 'False')
K_numpy = self.k.K(self.X, self.X2)
self.k.gradient = 0.
self.k.update_gradients_full(dL_dK, self.X, self.X2)
grads_numpy = self.k.gradient.copy()
self.assertTrue(np.allclose(K_numpy, K_weave))
self.assertTrue(np.allclose(grads_numpy, grads_weave))
self.assertTrue(np.allclose(K_numpy, K_cython))
self.assertTrue(np.allclose(grads_numpy, grads_cython))
#reset the cython state for any other tests
GPy.util.config.config.set('cython', 'working', 'true')
#reset the weave state for any other tests
GPy.util.config.config.set('weave', 'working', 'False')
class KernelTestsProductWithZeroValues(unittest.TestCase):

View file

@ -5,10 +5,7 @@ import numpy as np
from . import linalg
from .config import config
try:
from scipy import weave
except ImportError:
config.set('weave', 'working', 'False')
import choleskies_cython
def safe_root(N):
i = np.sqrt(N)
@ -17,36 +14,6 @@ def safe_root(N):
raise ValueError("N is not square!")
return j
def _flat_to_triang_weave(flat):
"""take a matrix N x D and return a M X M x D array where
N = M(M+1)/2
the lower triangluar portion of the d'th slice of the result is filled by the d'th column of flat.
This is the weave implementation
"""
N, D = flat.shape
M = (-1 + safe_root(8*N+1))/2
ret = np.zeros((M, M, D))
flat = np.ascontiguousarray(flat)
code = """
int count = 0;
for(int m=0; m<M; m++)
{
for(int mm=0; mm<=m; mm++)
{
for(int d=0; d<D; d++)
{
ret[d + m*D*M + mm*D] = flat[count];
count++;
}
}
}
"""
weave.inline(code, ['flat', 'ret', 'D', 'M'])
return ret
def _flat_to_triang_pure(flat_mat):
N, D = flat_mat.shape
M = (-1 + safe_root(8*N+1))//2
@ -59,34 +26,11 @@ def _flat_to_triang_pure(flat_mat):
count = count+1
return ret
if config.getboolean('weave', 'working'):
flat_to_triang = _flat_to_triang_weave
else:
flat_to_triang = _flat_to_triang_pure
def _flat_to_triang_cython(flat_mat):
N, D = flat_mat.shape
M = (-1 + safe_root(8*N+1))//2
return choleskies_cython.flat_to_triang(flat_mat, M)
def _triang_to_flat_weave(L):
M, _, D = L.shape
L = np.ascontiguousarray(L) # should do nothing if L was created by flat_to_triang
N = M*(M+1)/2
flat = np.empty((N, D))
code = """
int count = 0;
for(int m=0; m<M; m++)
{
for(int mm=0; mm<=m; mm++)
{
for(int d=0; d<D; d++)
{
flat[count] = L[d + m*D*M + mm*D];
count++;
}
}
}
"""
weave.inline(code, ['flat', 'L', 'D', 'M'])
return flat
def _triang_to_flat_pure(L):
M, _, D = L.shape
@ -101,41 +45,19 @@ def _triang_to_flat_pure(L):
count = count +1
return flat
if config.getboolean('weave', 'working'):
triang_to_flat = _triang_to_flat_weave
else:
triang_to_flat = _triang_to_flat_pure
def _triang_to_flat_cython(L):
return choleskies_cython.triang_to_flat(L)
def triang_to_cov(L):
return np.dstack([np.dot(L[:,:,i], L[:,:,i].T) for i in range(L.shape[-1])])
def multiple_dpotri_old(Ls):
M, _, D = Ls.shape
Kis = np.rollaxis(Ls, -1).copy()
[dpotri(Kis[i,:,:], overwrite_c=1, lower=1) for i in range(D)]
code = """
for(int d=0; d<D; d++)
{
for(int m=0; m<M; m++)
{
for(int mm=0; mm<m; mm++)
{
Kis[d*M*M + mm*M + m ] = Kis[d*M*M + m*M + mm];
}
}
}
"""
weave.inline(code, ['Kis', 'D', 'M'])
Kis = np.rollaxis(Kis, 0, 3) #wtf rollaxis?
return Kis
def multiple_dpotri(Ls):
return np.dstack([linalg.dpotri(np.asfortranarray(Ls[:,:,i]), lower=1)[0] for i in range(Ls.shape[-1])])
def indexes_to_fix_for_low_rank(rank, size):
"""
work out which indexes of the flatteneed array should be fixed if we want the cholesky to represent a low rank matrix
Work out which indexes of the flatteneed array should be fixed if we want
the cholesky to represent a low rank matrix
"""
#first we'll work out what to keep, and the do the set difference.
@ -153,15 +75,10 @@ def indexes_to_fix_for_low_rank(rank, size):
return np.setdiff1d(np.arange((size**2+size)/2), keep)
if config.getboolean('cython', 'working'):
triang_to_flat = _triang_to_flat_cython
flat_to_triang = _flat_to_triang_cython
else:
triang_to_flat = _triang_to_flat_pure
flat_to_triang = _flat_to_triang_pure
#class cholchecker(GPy.core.Model):
#def __init__(self, L, name='cholchecker'):
#super(cholchecker, self).__init__(name)
#self.L = GPy.core.Param('L',L)
#self.link_parameter(self.L)
#def parameters_changed(self):
#LL = flat_to_triang(self.L)
#Ki = multiple_dpotri(LL)
#self.L.gradient = 2*np.einsum('ijk,jlk->ilk', Ki, LL)
#self._loglik = np.sum([np.sum(np.log(np.abs(np.diag()))) for i in range(self.L.shape[-1])])
#

6532
GPy/util/choleskies_cython.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,41 @@
# Copyright James Hensman and Alan Saul 2015
import numpy as np
cimport numpy as np
from . import linalg
def flat_to_triang(np.ndarray[double, ndim=2] flat, int M):
    """Take an N x D matrix and return an M x M x D array, where N = M(M+1)/2.

    The lower triangular portion of the d'th slice of the result is filled,
    row by row, from the d'th column of flat.  Inverse of triang_to_flat.
    (Fixed: removed the unused local N; corrected the "triangluar" typo.)
    """
    cdef int D = flat.shape[1]
    cdef int count = 0
    cdef np.ndarray[double, ndim=3] ret = np.zeros((M, M, D))
    for d in range(D):
        count = 0  # restart at the top of the triangle for each slice
        for m in range(M):
            for mm in range(m+1):
                ret[m, mm, d] = flat[count,d]
                count += 1
    return ret
def triang_to_flat(np.ndarray[double, ndim=3] L):
    """Pack the lower triangles of an M x M x D array into an N x D matrix,
    N = M(M+1)//2, row by row.  Inverse of flat_to_triang.
    """
    cdef int M = L.shape[0]
    cdef int D = L.shape[2]
    # floor division: M*(M+1) is always even, and '//' keeps the result an
    # integer regardless of Py2/Py3 (language_level) division semantics
    cdef int N = M*(M+1)//2
    cdef int count = 0
    cdef np.ndarray[double, ndim=2] flat = np.empty((N, D))
    for d in range(D):
        count = 0
        for m in range(M):
            for mm in range(m+1):
                flat[count,d] = L[m, mm, d]
                count += 1
    return flat

View file

@ -15,11 +15,7 @@ import warnings
import os
from .config import config
import logging
try:
from scipy import weave
except ImportError:
config.set('weave', 'working', 'False')
import linalg_cython
_scipyversion = np.float64((scipy.__version__).split('.')[:2])
@ -422,114 +418,33 @@ def DSYR(*args, **kwargs):
def symmetrify(A, upper=False):
"""
Take the square matrix A and make it symmetrical by copting elements from the lower half to the upper
Take the square matrix A and make it symmetrical by copting elements from
the lower half to the upper
works IN PLACE.
note: tries to use weave, falls back to a slower numpy version
note: tries to use cython, falls back to a slower numpy version
"""
if config.getboolean('weave', 'working'):
try:
symmetrify_weave(A, upper)
except:
print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
config.set('weave', 'working', 'False')
symmetrify_numpy(A, upper)
if config.getboolean('cython', 'working'):
_symmetrify_cython(A, upper)
else:
symmetrify_numpy(A, upper)
_symmetrify_numpy(A, upper)
def symmetrify_weave(A, upper=False):
"""
Take the square matrix A and make it symmetrical by copting elements from the lower half to the upper
def _symmetrify_cython(A, upper=False):
return linalg_cython.symmetrify(A, upper)
works IN PLACE.
"""
N, M = A.shape
assert N == M
c_contig_code = """
int iN;
for (int i=1; i<N; i++){
iN = i*N;
for (int j=0; j<i; j++){
A[i+j*N] = A[iN+j];
}
}
"""
f_contig_code = """
int iN;
for (int i=1; i<N; i++){
iN = i*N;
for (int j=0; j<i; j++){
A[iN+j] = A[i+j*N];
}
}
"""
N = int(N) # for safe type casting
if A.flags['C_CONTIGUOUS'] and upper:
weave.inline(f_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
elif A.flags['C_CONTIGUOUS'] and not upper:
weave.inline(c_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
elif A.flags['F_CONTIGUOUS'] and upper:
weave.inline(c_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
elif A.flags['F_CONTIGUOUS'] and not upper:
weave.inline(f_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
else:
if upper:
tmp = np.tril(A.T)
else:
tmp = np.tril(A)
A[:] = 0.0
A += tmp
A += np.tril(tmp, -1).T
def symmetrify_numpy(A, upper=False):
"""
Force a matrix to be symmetric
"""
def _symmetrify_numpy(A, upper=False):
triu = np.triu_indices_from(A,k=1)
if upper:
A.T[triu] = A[triu]
else:
A[triu] = A.T[triu]
#This function appears to be unused. It's use of weave makes it problematic
#Commenting out for now
#def cholupdate(L, x):
# """
# update the LOWER cholesky factor of a pd matrix IN PLACE
#
# if L is the lower chol. of K, then this function computes L\_
# where L\_ is the lower chol of K + x*x^T
# """
# support_code = """
# #include <math.h>
# """
# code = """
# double r,c,s;
# int j,i;
# for(j=0; j<N; j++){
# r = sqrt(L(j,j)*L(j,j) + x(j)*x(j));
# c = r / L(j,j);
# s = x(j) / L(j,j);
# L(j,j) = r;
# for (i=j+1; i<N; i++){
# L(i,j) = (L(i,j) + s*x(i))/c;
# x(i) = c*x(i) - s*L(i,j);
# }
# }
# """
# x = x.copy()
# N = x.size
# weave.inline(code, support_code=support_code, arg_names=['N', 'L', 'x'], type_converters=weave.converters.blitz)
def backsub_both_sides(L, X, transpose='left'):
""" Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
"""
Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky
"""
if transpose == 'left':
tmp, _ = dtrtrs(L, X, lower=1, trans=1)
return dtrtrs(L, tmp.T, lower=1, trans=1)[0].T

6191
GPy/util/linalg_cython.c Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,34 @@
cimport numpy as np
from cpython cimport bool
import cython
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def symmetrify(np.ndarray[double, ndim=2] A, bool upper):
    """Make the square matrix A symmetric, IN PLACE.

    If upper is False, the lower triangle is copied into the upper;
    if upper is True, the upper triangle is copied into the lower.
    Returns nothing.
    """
    cdef int N = A.shape[0]
    if not upper:
        # j < i, so A[i, j] is below the diagonal: copy lower -> upper
        for i in xrange(N):
            for j in xrange(i):
                A[j, i] = A[i, j]
    else:
        # i < j, so A[i, j] is above the diagonal: copy upper -> lower
        for j in xrange(N):
            for i in xrange(j):
                A[j, i] = A[i, j]
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def cholupdate(np.ndarray[double, ndim=1] x, np.ndarray[double, ndim=2] L, int N):
    """Rank-1 update of a LOWER Cholesky factor, IN PLACE.

    If L is the lower Cholesky factor of K, on exit L holds the lower
    Cholesky factor of K + x x^T.  Both L and x are overwritten (x is
    used as workspace), so pass copies if the caller needs them intact.

    Fixed: the inner loop must run over the rows BELOW the diagonal
    (i = j+1 .. N-1), as in the reference weave implementation; the
    original looped over i in xrange(j) and never touched the
    sub-diagonal entries.  Also dropped the dead trailing statement
    `r = np.sqrt(L[j,j])`.
    """
    cdef double r
    cdef double c
    cdef double s
    for j in xrange(N):
        r = np.sqrt(L[j,j]*L[j,j] + x[j]*x[j])
        c = r / L[j,j]
        s = x[j] / L[j,j]
        L[j,j] = r
        for i in xrange(j+1, N):
            L[i,j] = (L[i,j] + s*x[i])/c
            x[i] = c*x[i] - s*L[i,j]

View file

@ -42,9 +42,6 @@ def chain_1(df_dg, dg_dx):
"""
if np.all(dg_dx==1.):
return df_dg
if len(df_dg) > 1 and len(df_dg.shape)>1 and df_dg.shape[-1] > 1:
#import ipdb; ipdb.set_trace() # XXX BREAKPOINT
raise NotImplementedError('Not implemented for matricies yet')
return df_dg * dg_dx
def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
@ -56,8 +53,6 @@ def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
"""
if np.all(dg_dx==1.) and np.all(d2g_dx2 == 0):
return d2f_dg2
if len(d2f_dg2) > 1 and len(d2f_dg2.shape)>1 and d2f_dg2.shape[-1] > 1:
raise NotImplementedError('Not implemented for matricies yet')
dg_dx_2 = np.clip(dg_dx, -np.inf, _lim_val_square)**2
#dg_dx_2 = dg_dx**2
return d2f_dg2*(dg_dx_2) + df_dg*d2g_dx2
@ -71,11 +66,7 @@ def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
"""
if np.all(dg_dx==1.) and np.all(d2g_dx2==0) and np.all(d3g_dx3==0):
return d3f_dg3
if ( (len(d2f_dg2) > 1 and d2f_dg2.shape[-1] > 1)
or (len(d3f_dg3) > 1 and d3f_dg3.shape[-1] > 1)):
raise NotImplementedError('Not implemented for matricies yet')
dg_dx_3 = np.clip(dg_dx, -np.inf, _lim_val_cube)**3
#dg_dx_3 = dg_dx**3
return d3f_dg3*(dg_dx_3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3
def opt_wrapper(m, **kwargs):
@ -133,10 +124,11 @@ def kmm_init(X, m = 10):
### make a parameter to its corresponding array:
def param_to_array(*param):
"""
Convert an arbitrary number of parameters to :class:ndarray class objects. This is for
converting parameter objects to numpy arrays, when using scipy.weave.inline routine.
In scipy.weave.blitz there is no automatic array detection (even when the array inherits
from :class:ndarray)"""
Convert an arbitrary number of parameters to :class:ndarray class objects.
This is for converting parameter objects to numpy arrays, when using
scipy.weave.inline routine. In scipy.weave.blitz there is no automatic
array detection (even when the array inherits from :class:ndarray)
"""
import warnings
warnings.warn("Please use param.values, as this function will be deprecated in the next release.", DeprecationWarning)
assert len(param) > 0, "At least one parameter needed"

View file

@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from GPy.core.parameterization import Parameterized, Param
from ..core.parameterization import Parameterized, Param
from ..core.parameterization.transformations import Logexp
class WarpingFunction(Parameterized):

View file

@ -2,7 +2,8 @@
# -*- coding: utf-8 -*-
import os
from setuptools import setup
from setuptools import setup, Extension
import numpy as np
# Version number
version = '0.6.1'
@ -10,6 +11,27 @@ version = '0.6.1'
def read(fname):
    """Return the text contents of *fname*, resolved relative to this file.

    Used to pull long-description/author metadata out of the repo's text
    files at setup time.
    """
    # Context manager guarantees the handle is closed even if .read()
    # raises; the original left the file open until garbage collection.
    with open(os.path.join(os.path.dirname(__file__), fname)) as f:
        return f.read()
# NOTE(review): -march=native left disabled — presumably so built binaries
# stay portable across CPUs; confirm before re-enabling.
#compile_flags = ["-march=native", '-fopenmp', '-O3', ]
compile_flags = [ '-fopenmp', '-O3', ]

# C extension modules, pre-generated from the .pyx sources and compiled
# against the numpy headers.
ext_mods = [Extension(name='GPy.kern._src.stationary_cython',
                      sources=['GPy/kern/_src/stationary_cython.c','GPy/kern/_src/stationary_utils.c'],
                      include_dirs=[np.get_include()],
                      extra_compile_args=compile_flags,
                      # stationary_utils.c uses OpenMP, so link the gomp runtime
                      extra_link_args = ['-lgomp']),
            Extension(name='GPy.util.choleskies_cython',
                      sources=['GPy/util/choleskies_cython.c'],
                      include_dirs=[np.get_include()],
                      extra_compile_args=compile_flags),
            Extension(name='GPy.util.linalg_cython',
                      sources=['GPy/util/linalg_cython.c'],
                      include_dirs=[np.get_include()],
                      extra_compile_args=compile_flags),
            Extension(name='GPy.kern._src.coregionalize_cython',
                      sources=['GPy/kern/_src/coregionalize_cython.c'],
                      include_dirs=[np.get_include()],
                      extra_compile_args=compile_flags)]
setup(name = 'GPy',
version = version,
author = read('AUTHORS.txt'),
@ -18,6 +40,7 @@ setup(name = 'GPy',
license = "BSD 3-clause",
keywords = "machine-learning gaussian-processes kernels",
url = "http://sheffieldml.github.com/GPy/",
ext_modules = ext_mods,
packages = ["GPy.models",
"GPy.inference.optimization",
"GPy.inference.mcmc",