From f11090ad8a52ca687ed2e9e792c8bb34935230fe Mon Sep 17 00:00:00 2001
From: Zhenwen Dai <zhenwend@amazon.co.uk>
Date: Thu, 23 Nov 2017 09:20:05 +0000
Subject: [PATCH] improve the documentation for LVMOGP

---
 .../latent_function_inference/vardtc_md.py    | 12 ++----
 .../vardtc_svi_multiout.py                    | 14 +++----
 .../vardtc_svi_multiout_miss.py               |  5 ++-
 GPy/models/gp_multiout_regression.py          | 42 +++++++++++--------
 GPy/models/gp_multiout_regression_md.py       | 31 ++++++++++++--
 GPy/models/sparse_gp_regression_md.py         | 18 +++++---
 6 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/GPy/inference/latent_function_inference/vardtc_md.py b/GPy/inference/latent_function_inference/vardtc_md.py
index 2297074a..89e42cb4 100644
--- a/GPy/inference/latent_function_inference/vardtc_md.py
+++ b/GPy/inference/latent_function_inference/vardtc_md.py
@@ -1,5 +1,5 @@
-
-
+# Copyright (c) 2017, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from GPy.util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv, dpotri
 from GPy.util import diag
@@ -12,13 +12,7 @@ log_2_pi = np.log(2*np.pi)
 
 class VarDTC_MD(LatentFunctionInference):
     """
-    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
-
-    The function self.inference returns a Posterior object, which summarizes
-    the posterior.
-
-    For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
-
+    The VarDTC inference method for sparse GP with missing data (GPy.models.SparseGPRegressionMD)
     """
     const_jitter = 1e-6
 
diff --git a/GPy/inference/latent_function_inference/vardtc_svi_multiout.py b/GPy/inference/latent_function_inference/vardtc_svi_multiout.py
index 99078ab5..c897236a 100644
--- a/GPy/inference/latent_function_inference/vardtc_svi_multiout.py
+++ b/GPy/inference/latent_function_inference/vardtc_svi_multiout.py
@@ -1,4 +1,6 @@
-#from .posterior import Posterior
+# Copyright (c) 2017, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 from GPy.util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv, dpotri
 from GPy.util import diag
 from GPy.core.parameterization.variational import VariationalPosterior
@@ -10,13 +12,7 @@ log_2_pi = np.log(2*np.pi)
 
 class VarDTC_SVI_Multiout(LatentFunctionInference):
     """
-    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
-
-    The function self.inference returns a Posterior object, which summarizes
-    the posterior.
-
-    For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
-
+    The VarDTC inference method for Multi-output GP regression (GPy.models.GPMultioutRegression)
     """
     const_jitter = 1e-6
 
@@ -100,7 +96,7 @@ class VarDTC_SVI_Multiout(LatentFunctionInference):
                - (LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)*LrInvPsi2_rLrInvT).sum() \
                -  tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT \
                + 2 * (Y * LcInvPsi1_cT.T.dot(LcInvMLrInvT).dot(LrInvPsi1_rT)).sum() - psi0_c * psi0_r \
-               + tr_LrInvPsi2_rLrInvT * tr_LcInvPsi2_cLcInvT 
+               + tr_LrInvPsi2_rLrInvT * tr_LcInvPsi2_cLcInvT
 
         logL = -N*D/2.*(np.log(2.*np.pi)-np.log(beta)) + beta/2.* logL_A \
                -Mc * (np.log(np.diag(Lr)).sum()-np.log(np.diag(LSr)).sum())  -Mr * (np.log(np.diag(Lc)).sum()-np.log(np.diag(LSc)).sum()) \
diff --git a/GPy/inference/latent_function_inference/vardtc_svi_multiout_miss.py b/GPy/inference/latent_function_inference/vardtc_svi_multiout_miss.py
index dc4d24c3..52767c47 100644
--- a/GPy/inference/latent_function_inference/vardtc_svi_multiout_miss.py
+++ b/GPy/inference/latent_function_inference/vardtc_svi_multiout_miss.py
@@ -1,4 +1,6 @@
-#from .posterior import Posterior
+# Copyright (c) 2017, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 from GPy.util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv, dpotri
 from GPy.util import diag
 from GPy.core.parameterization.variational import VariationalPosterior
@@ -11,6 +13,7 @@ log_2_pi = np.log(2*np.pi)
 
 class VarDTC_SVI_Multiout_Miss(LatentFunctionInference):
     """
+    The VarDTC inference method for Multi-output GP regression with missing data (GPy.models.GPMultioutRegressionMD)
     """
     const_jitter = 1e-6
 
diff --git a/GPy/models/gp_multiout_regression.py b/GPy/models/gp_multiout_regression.py
index 193d9726..aa1cf965 100644
--- a/GPy/models/gp_multiout_regression.py
+++ b/GPy/models/gp_multiout_regression.py
@@ -13,26 +13,28 @@ from ..util.linalg import tdot
 
 class GPMultioutRegression(SparseGP):
     """
-    Gaussian Process model for scalable multioutput regression
+    Gaussian Process model for multi-output regression without missing data
 
-    This is a thin wrapper around the models.GP class, with a set of sensible defaults
+    This is an implementation of Latent Variable Multiple Output Gaussian Processes (LVMOGP) in [Dai et al. 2017].
 
-    :param X_list: list of input observations corresponding to each output
-    :type X_list: list of numpy arrays
-    :param Y_list: list of observed values related to the different noise models
-    :type Y_list: list of numpy arrays
-    :param kernel: a GPy kernel ** Coregionalized, defaults to RBF ** Coregionalized
-    :type kernel: None | GPy.kernel defaults
-    :likelihoods_list: a list of likelihoods, defaults to list of Gaussian likelihoods
-    :type likelihoods_list: None | a list GPy.likelihoods
-    :param name: model name
-    :type name: string
-    :param W_rank: number tuples of the corregionalization parameters 'W' (see coregionalize kernel documentation)
-    :type W_rank: integer
-    :param kernel_name: name of the kernel
-    :type kernel_name: string
+    Zhenwen Dai, Mauricio A. Álvarez and Neil D. Lawrence. Efficient Modeling of Latent Information in Supervised Learning using Gaussian Processes. In NIPS, 2017.
+
+    :param X: input observations. Numpy.ndarray
+    :param Y: output observations, each column corresponding to an output dimension. Numpy.ndarray
+    :param Xr_dim: the dimensionality of the latent space in which the output dimensions are embedded
+    :param kernel: a GPy kernel for GP of individual output dimensions ** defaults to RBF **
+    :param kernel_row: a GPy kernel for the GP of the latent space ** defaults to RBF **
+    :param Z: inducing inputs
+    :param Z_row: inducing inputs for the latent space
+    :param X_row: the initial value of the mean of the variational posterior distribution of points in the latent space
+    :param Xvariance_row: the initial value of the variance of the variational posterior distribution of points in the latent space
+    :param num_inducing: a tuple (M, Mr). M is the number of inducing points for GP of individual output dimensions. Mr is the number of inducing points for the latent space.
+    :param qU_var_r_W_dim: the dimensionality of the covariance of q(U) for the latent space. If it is smaller than the number of inducing points, it represents a low-rank parameterization of the covariance matrix.
+    :param qU_var_c_W_dim: the dimensionality of the covariance of q(U) for the GP regression. If it is smaller than the number of inducing points, it represents a low-rank parameterization of the covariance matrix.
+    :param init: the choice of initialization: 'GP' or 'rand'. With 'rand', the model is initialized randomly. With 'GP', the model is initialized through the following protocol: (1) fit a sparse GP, (2) fit a BGPLVM based on the outcome of the sparse GP, (3) initialize the model based on the outcome of the BGPLVM.
+    :param name: the name of the model
     """
-    def __init__(self, X, Y, Xr_dim, kernel=None, kernel_row=None, likelihood=None, Z=None, Z_row=None, X_row=None, Xvariance_row=None, num_inducing=(10,10), qU_var_r_W_dim=None, qU_var_c_W_dim=None, init='GP', name='GPMR'):
+    def __init__(self, X, Y, Xr_dim, kernel=None, kernel_row=None, Z=None, Z_row=None, X_row=None, Xvariance_row=None, num_inducing=(10,10), qU_var_r_W_dim=None, qU_var_c_W_dim=None, init='GP', name='GPMR'):
 
         #Kernel
         if kernel is None:
@@ -165,6 +167,12 @@ class GPMultioutRegression(SparseGP):
         self.variational_prior_row.update_gradients_KL(self.X_row)
 
     def optimize_auto(self,max_iters=10000,verbose=True):
+        """
+        Optimize the model parameters through a pre-defined protocol.
+
+        :param max_iters: the maximum number of iterations.
+        :param verbose: print the progress of optimization or not.
+        """
         self.Z.fix(warning=False)
         self.kern.fix(warning=False)
         self.kern_row.fix(warning=False)
diff --git a/GPy/models/gp_multiout_regression_md.py b/GPy/models/gp_multiout_regression_md.py
index 0d912843..5fa53038 100644
--- a/GPy/models/gp_multiout_regression_md.py
+++ b/GPy/models/gp_multiout_regression_md.py
@@ -14,11 +14,30 @@ from .sparse_gp_regression_md import SparseGPRegressionMD
 
 class GPMultioutRegressionMD(SparseGP):
     """
-    Gaussian Process model for scalable multioutput regression
+    Gaussian Process model for multi-output regression with missing data
 
-    This is a thin wrapper around the models.GP class, with a set of sensible defaults
+    This is an implementation of Latent Variable Multiple Output Gaussian Processes (LVMOGP) in [Dai et al. 2017]. This model targets the use case in which each output dimension is observed at a different set of inputs. The model takes a different data format: the input and output observations of all the output dimensions are stacked together correspondingly into two matrices. An extra array is used to indicate the index of the output dimension for each data point. The output dimensions are indexed using integers from 0 to D-1, assuming there are D output dimensions.
+
+    Zhenwen Dai, Mauricio A. Álvarez and Neil D. Lawrence. Efficient Modeling of Latent Information in Supervised Learning using Gaussian Processes. In NIPS, 2017.
+
+    :param X: input observations. Numpy.ndarray
+    :param Y: output observations of all the output dimensions stacked into a single column, i.e. of shape (N,) or (N, 1). Numpy.ndarray
+    :param indexD: the array containing the index of output dimension for each data point
+    :param Xr_dim: the dimensionality of the latent space in which the output dimensions are embedded
+    :param kernel: a GPy kernel for GP of individual output dimensions ** defaults to RBF **
+    :param kernel_row: a GPy kernel for the GP of the latent space ** defaults to RBF **
+    :param Z: inducing inputs
+    :param Z_row: inducing inputs for the latent space
+    :param X_row: the initial value of the mean of the variational posterior distribution of points in the latent space
+    :param Xvariance_row: the initial value of the variance of the variational posterior distribution of points in the latent space
+    :param num_inducing: a tuple (M, Mr). M is the number of inducing points for GP of individual output dimensions. Mr is the number of inducing points for the latent space.
+    :param qU_var_r_W_dim: the dimensionality of the covariance of q(U) for the latent space. If it is smaller than the number of inducing points, it represents a low-rank parameterization of the covariance matrix.
+    :param qU_var_c_W_dim: the dimensionality of the covariance of q(U) for the GP regression. If it is smaller than the number of inducing points, it represents a low-rank parameterization of the covariance matrix.
+    :param init: the choice of initialization: 'GP' or 'rand'. With 'rand', the model is initialized randomly. With 'GP', the model is initialized through the following protocol: (1) fit a sparse GP, (2) fit a BGPLVM based on the outcome of the sparse GP, (3) initialize the model based on the outcome of the BGPLVM.
+    :param heter_noise: whether to assume heteroscedastic noise in the model, boolean
+    :param name: the name of the model
     """
-    def __init__(self, X, Y, indexD, Xr_dim, kernel=None, kernel_row=None, likelihood=None, Z=None, Z_row=None, X_row=None, Xvariance_row=None, num_inducing=(10,10), qU_var_r_W_dim=None, qU_var_c_W_dim=None, init='GP', heter_noise=False, name='GPMR'):
+    def __init__(self, X, Y, indexD, Xr_dim, kernel=None, kernel_row=None,  Z=None, Z_row=None, X_row=None, Xvariance_row=None, num_inducing=(10,10), qU_var_r_W_dim=None, qU_var_c_W_dim=None, init='GP', heter_noise=False, name='GPMRMD'):
 
         assert len(Y.shape)==1 or Y.shape[1]==1
 
@@ -163,6 +182,12 @@ class GPMultioutRegressionMD(SparseGP):
         self.variational_prior_row.update_gradients_KL(self.X_row)
 
     def optimize_auto(self,max_iters=10000,verbose=True):
+        """
+        Optimize the model parameters through a pre-defined protocol.
+
+        :param max_iters: the maximum number of iterations.
+        :param verbose: print the progress of optimization or not.
+        """
         self.Z.fix(warning=False)
         self.kern.fix(warning=False)
         self.kern_row.fix(warning=False)
diff --git a/GPy/models/sparse_gp_regression_md.py b/GPy/models/sparse_gp_regression_md.py
index 5762c44c..4b6de319 100644
--- a/GPy/models/sparse_gp_regression_md.py
+++ b/GPy/models/sparse_gp_regression_md.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, James Hensman
+# Copyright (c) 2017, Zhenwen Dai
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
@@ -12,9 +12,20 @@ from GPy.core.parameterization.variational import NormalPosterior
 class SparseGPRegressionMD(SparseGP_MPI):
     """
     Sparse Gaussian Process Regression with Missing Data
+
+    This model targets the use case in which there are multiple output dimensions (different dimensions are assumed to be independent, following the same GP prior) and each output dimension is observed at a different set of inputs. The model takes a different data format: the input and output observations of all the output dimensions are stacked together correspondingly into two matrices. An extra array is used to indicate the index of the output dimension for each data point. The output dimensions are indexed using integers from 0 to D-1, assuming there are D output dimensions.
+
+    :param X: input observations. Numpy.ndarray
+    :param Y: output observations of all the output dimensions stacked into a single column, i.e. of shape (N,) or (N, 1). Numpy.ndarray
+    :param indexD: the array containing the index of output dimension for each data point
+    :param kernel: a GPy kernel for GP of individual output dimensions ** defaults to RBF **
+    :param Z: inducing inputs
+    :param num_inducing: the number of inducing points, an integer ** defaults to 10 **
+    :param individual_Y_noise: whether individual output dimensions have their own noise variance or not, boolean
+    :param name: the name of the model
     """
 
-    def __init__(self, X, Y, indexD, kernel=None, Z=None, num_inducing=10, X_variance=None, normalizer=None, mpi_comm=None, individual_Y_noise=False, name='sparse_gp'):
+    def __init__(self, X, Y, indexD, kernel=None, Z=None, num_inducing=10,  normalizer=None, mpi_comm=None, individual_Y_noise=False, name='sparse_gp'):
 
         assert len(Y.shape)==1 or Y.shape[1]==1
         self.individual_Y_noise = individual_Y_noise
@@ -39,9 +50,6 @@ class SparseGPRegressionMD(SparseGP_MPI):
         else:
             likelihood = likelihoods.Gaussian(variance=np.var(Y)*0.01)
 
-        if not (X_variance is None):
-            X = NormalPosterior(X,X_variance)
-
         infr = VarDTC_MD()
 
         SparseGP_MPI.__init__(self, X, Y, Z, kernel, likelihood, inference_method=infr, normalizer=normalizer, mpi_comm=mpi_comm, name=name)