From ef15de9411123b936a8fe556e3257970c12a56d0 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 26 Apr 2013 17:26:43 +0100
Subject: [PATCH] added a tdot function (thanks Iain)

---
 GPy/models/sparse_GP.py |  5 +--
 GPy/util/linalg.py      | 99 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 99 insertions(+), 5 deletions(-)

diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index e158e026..dc77e795 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -108,9 +108,6 @@ class sparse_GP(GP):
         self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
 
         self.psi1V = np.dot(self.psi1, self.V)
-        #tmp = np.dot(self.Lmi.T, self.LBi.T)
-        #tmp = linalg.lapack.clapack.dtrtrs(self.Lm.T,np.asarray(self.LBi.T,order='C'),lower=0)[0]
-        #self.C = np.dot(tmp,tmp.T) #TODO: tmp is triangular. replace with dtrmm (blas) when available
         tmp = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(self.Bi),lower=1,trans=1)[0]
         self.C = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(tmp.T),lower=1,trans=1)[0]
         self.Cpsi1V = np.dot(self.C,self.psi1V)
@@ -171,7 +168,7 @@ class sparse_GP(GP):
             #likelihood is not heterscedatic
             self.partial_for_likelihood =   - 0.5 * self.N*self.D*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2
             self.partial_for_likelihood += 0.5 * self.D * (self.psi0.sum()*self.likelihood.precision**2 - np.trace(self.A)*self.likelihood.precision*sf2)
-            self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self.likelihood.precision
+            self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self.likelihood.precision # TODO: unstable?
             self.partial_for_likelihood += self.likelihood.precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
 
 
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 79025d4f..34e30dca 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -1,9 +1,12 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
+#tdot function courtesy of Ian Murray:
+# Iain Murray, April 2013. iain contactable via iainmurray.net
+# http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot.py
 
 import numpy as np
-from scipy import linalg, optimize
+from scipy import linalg, optimize, weave
 import pylab as pb
 import Tango
 import sys
@@ -11,9 +14,17 @@ import re
 import pdb
 import cPickle
 import types
+import ctypes
+from ctypes import byref, c_char, c_int, c_double # TODO
 #import scipy.lib.lapack.flapack
 import scipy as sp
 
+try:
+    _blaslib = ctypes.cdll.LoadLibrary(np.core._dotblas.__file__)
+    _blas_available = True
+except:
+    _blas_available = False
+
 def trace_dot(a,b):
     """
     efficiently compute the trace of the matrix product of a and b
@@ -175,3 +186,89 @@ def PCA(Y, Q):
     X /= v;
     W *= v;
     return X, W.T
+
+
+def tdot_numpy(mat,out=None):
+    return np.dot(mat,mat.T,out)
+
+def tdot_blas(mat, out=None):
+    """returns np.dot(mat, mat.T), but faster for large 2D arrays of doubles."""
+    if (mat.dtype != 'float64') or (len(mat.shape) != 2):
+        return np.dot(mat, mat.T)
+    nn = mat.shape[0]
+    if not out:
+        out = np.zeros((nn,nn))
+    else:
+        assert(out.dtype == 'float64')
+        assert(out.shape == (nn,nn))
+        # FIXME: should allow non-contiguous out, and copy output into it:
+        assert(8 in out.strides)
+        # zeroing needed because of dumb way I copy across triangular answer
+        out[:] = 0.0
+
+    ## Call to DSYRK from BLAS
+    # If already in Fortran order (rare), and has the right sorts of strides I
+    # could avoid the copy. I also thought swapping to cblas API would allow use
+    # of C order. However, I tried that and had errors with large matrices:
+    # http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot_broken.py
+    mat = mat.copy(order='F')
+    TRANS = c_char('n')
+    N = c_int(mat.shape[0])
+    K = c_int(mat.shape[1])
+    LDA = c_int(mat.shape[0])
+    UPLO = c_char('l')
+    ALPHA = c_double(1.0)
+    A = mat.ctypes.data_as(ctypes.c_void_p)
+    BETA = c_double(0.0)
+    C = out.ctypes.data_as(ctypes.c_void_p)
+    LDC = c_int(np.max(out.strides) / 8)
+    _blaslib.dsyrk_(byref(UPLO), byref(TRANS), byref(N), byref(K),
+            byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))
+
+    symmetrify(out.T)
+
+    return out
+
+def tdot(*args, **kwargs):
+    if _blas_available:
+        return tdot_blas(*args,**kwargs)
+    else:
+        return tdot_numpy(*args,**kwargs)
+
+def symmetrify(A):
+    """
+    Take the square matrix A and make it symmetrical by copting elements from the lower half to the upper
+
+    works IN PLACE.
+    """
+    N,M = A.shape
+    assert N==M
+    c_contig_code = """
+    for (int i=1; i<N; i++){
+      for (int j=0; j<i; j++){
+        A[i+j*N] = A[i*N+j];
+      }
+    }
+    """
+    f_contig_code = """
+    for (int i=1; i<N; i++){
+      for (int j=0; j<i; j++){
+        A[i*N+j] = A[i+j*N];
+      }
+    }
+    """
+    if A.flags['C_CONTIGUOUS']:
+        weave.inline(c_contig_code,['A','N'])
+    elif A.flags['F_CONTIGUOUS']:
+        weave.inline(f_contig_code,['A','N'])
+    else:
+        tmp = np.tril(A)
+        A[:] = 0.0
+        A += tmp
+        A += np.tril(tmp,-1).T
+
+def symmetrify_murray(A):
+    A += A.T
+    nn = A.shape[0]
+    A[[range(nn),range(nn)]] /= 2.0
+