[ { "title": "Preface", "start_index": 1, "end_index": 6 }, { "title": "Preface", "start_index": 7, "end_index": 10 }, { "title": "Mathematical notation", "start_index": 11, "end_index": 13 }, { "title": "Contents", "start_index": 13, "end_index": 20 }, { "title": "Introduction", "start_index": 21, "end_index": 24, "child_nodes": [ { "title": "Example: Polynomial Curve Fitting", "start_index": 24, "end_index": 32 }, { "title": "Probability Theory", "start_index": 32, "end_index": 37, "child_nodes": [ { "title": "Probability densities", "start_index": 37, "end_index": 39 }, { "title": "Expectations and covariances", "start_index": 39, "end_index": 41 }, { "title": "Bayesian probabilities", "start_index": 41, "end_index": 44 }, { "title": "The Gaussian distribution", "start_index": 44, "end_index": 48 }, { "title": "Curve fitting re-visited", "start_index": 48, "end_index": 50 }, { "title": "Bayesian curve fitting", "start_index": 50, "end_index": 52 } ] }, { "title": "Model Selection", "start_index": 52, "end_index": 53 }, { "title": "The Curse of Dimensionality", "start_index": 53, "end_index": 58 }, { "title": "Decision Theory", "start_index": 58, "end_index": 59, "child_nodes": [ { "title": "Minimizing the misclassification rate", "start_index": 59, "end_index": 61 }, { "title": "Minimizing the expected loss", "start_index": 61, "end_index": 62 }, { "title": "The reject option", "start_index": 62, "end_index": 62 }, { "title": "Inference and decision", "start_index": 62, "end_index": 66 }, { "title": "Loss functions for regression", "start_index": 66, "end_index": 68 } ] }, { "title": "Information Theory", "start_index": 68, "end_index": 75, "child_nodes": [ { "title": "Relative entropy and mutual information", "start_index": 75, "end_index": 78 } ] } ] }, { "title": "Exercises", "start_index": 78, "end_index": 87 }, { "title": "Probability Distributions", "start_index": 87, "end_index": 88, "child_nodes": [ { "title": "Binary Variables", "start_index": 88, "end_index": 91, "child_nodes": [ { "title": "The beta distribution", "start_index": 91, "end_index": 94 } ] }, { "title": "Multinomial Variables", "start_index": 94, "end_index": 96, "child_nodes": [ { "title": "The Dirichlet distribution", "start_index": 96, "end_index": 98 } ] }, { "title": "The Gaussian Distribution", "start_index": 98, "end_index": 105, "child_nodes": [ { "title": "Conditional Gaussian distributions", "start_index": 105, "end_index": 108 }, { "title": "Marginal Gaussian distributions", "start_index": 108, "end_index": 110 }, { "title": "Bayes\u2019 theorem for Gaussian variables", "start_index": 110, "end_index": 113 }, { "title": "Maximum likelihood for the Gaussian", "start_index": 113, "end_index": 114 }, { "title": "Sequential estimation", "start_index": 114, "end_index": 117 }, { "title": "Bayesian inference for the Gaussian", "start_index": 117, "end_index": 122 }, { "title": "Student\u2019s t-distribution", "start_index": 122, "end_index": 125 }, { "title": "Periodic variables", "start_index": 125, "end_index": 130 }, { "title": "Mixtures of Gaussians", "start_index": 130, "end_index": 133 } ] }, { "title": "The Exponential Family", "start_index": 133, "end_index": 136, "child_nodes": [ { "title": "Maximum likelihood and sufficient statistics", "start_index": 136, "end_index": 137 }, { "title": "Conjugate priors", "start_index": 137, "end_index": 137 }, { "title": "Noninformative priors", "start_index": 137, "end_index": 140 } ] }, { "title": "Nonparametric Methods", "start_index": 140, "end_index": 142, "child_nodes": [ { "title": "Kernel density estimators", "start_index": 142, "end_index": 144 }, { "title": "Nearest-neighbour methods", "start_index": 144, "end_index": 147 } ] } ] }, { "title": "Exercises", "start_index": 147, "end_index": 156 }, { "title": "Linear Models for Regression", "start_index": 157, "end_index": 158, "child_nodes": [ { "title": "Linear Basis Function Models", "start_index": 158, "end_index": 160, "child_nodes": [ { "title": "Maximum likelihood and least squares", "start_index": 160, "end_index": 163 }, { "title": "Geometry of least squares", "start_index": 163, "end_index": 163 }, { "title": "Sequential learning", "start_index": 163, "end_index": 164 }, { "title": "Regularized least squares", "start_index": 164, "end_index": 166 }, { "title": "Multiple outputs", "start_index": 166, "end_index": 167 } ] }, { "title": "The Bias-Variance Decomposition", "start_index": 167, "end_index": 172 }, { "title": "Bayesian Linear Regression", "start_index": 172, "end_index": 172, "child_nodes": [ { "title": "Parameter distribution", "start_index": 172, "end_index": 176 }, { "title": "Predictive distribution", "start_index": 176, "end_index": 179 }, { "title": "Equivalent kernel", "start_index": 179, "end_index": 181 } ] }, { "title": "Bayesian Model Comparison", "start_index": 181, "end_index": 185 }, { "title": "The Evidence Approximation", "start_index": 185, "end_index": 186, "child_nodes": [ { "title": "Evaluation of the evidence function", "start_index": 186, "end_index": 188 }, { "title": "Maximizing the evidence function", "start_index": 188, "end_index": 190 }, { "title": "Effective number of parameters", "start_index": 190, "end_index": 192 } ] }, { "title": "Limitations of Fixed Basis Functions", "start_index": 192, "end_index": 193 } ] }, { "title": "Exercises", "start_index": 193, "end_index": 198 }, { "title": "Linear Models for Classification", "start_index": 199, "end_index": 201, "child_nodes": [ { "title": "Discriminant Functions", "start_index": 201, "end_index": 201, "child_nodes": [ { "title": "Two classes", "start_index": 201, "end_index": 202 }, { "title": "Multiple classes", "start_index": 202, "end_index": 204 }, { "title": "Least squares for classification", "start_index": 204, "end_index": 206 }, { "title": "Fisher\u2019s linear discriminant", "start_index": 206, "end_index": 209 }, { "title": "Relation to least squares", "start_index": 209, "end_index": 211 }, { "title": "Fisher\u2019s discriminant for multiple classes", "start_index": 211, "end_index": 212 }, { "title": "The perceptron algorithm", "start_index": 212, "end_index": 216 } ] }, { "title": "Probabilistic Generative Models", "start_index": 216, "end_index": 218, "child_nodes": [ { "title": "Continuous inputs", "start_index": 218, "end_index": 220 }, { "title": "Maximum likelihood solution", "start_index": 220, "end_index": 222 }, { "title": "Discrete features", "start_index": 222, "end_index": 222 }, { "title": "Exponential family", "start_index": 222, "end_index": 223 } ] }, { "title": "Probabilistic Discriminative Models", "start_index": 223, "end_index": 224, "child_nodes": [ { "title": "Fixed basis functions", "start_index": 224, "end_index": 225 }, { "title": "Logistic regression", "start_index": 225, "end_index": 227 }, { "title": "Iterative reweighted least squares", "start_index": 227, "end_index": 229 }, { "title": "Multiclass logistic regression", "start_index": 229, "end_index": 230 }, { "title": "Probit regression", "start_index": 230, "end_index": 232 }, { "title": "Canonical link functions", "start_index": 232, "end_index": 232 } ] }, { "title": "The Laplace Approximation", "start_index": 233, "end_index": 236, "child_nodes": [ { "title": "Model comparison and BIC", "start_index": 236, "end_index": 237 } ] }, { "title": "Bayesian Logistic Regression", "start_index": 237, "end_index": 237, "child_nodes": [ { "title": "Laplace approximation", "start_index": 237, "end_index": 238 }, { "title": "Predictive distribution", "start_index": 238, "end_index": 240 } ] } ] }, { "title": "Exercises", "start_index": 240, "end_index": 245 }, { "title": "Neural Networks", "start_index": 245, "end_index": 247, "child_nodes": [ { "title": "Feed-forward Network Functions", "start_index": 247, "end_index": 251, "child_nodes": [ { "title": "Weight-space symmetries", "start_index": 251, "end_index": 252 } ] }, { "title": "Network Training", "start_index": 252, "end_index": 256, "child_nodes": [ { "title": "Parameter optimization", "start_index": 256, "end_index": 257 }, { "title": "Local quadratic approximation", "start_index": 257, "end_index": 259 }, { "title": "Use of gradient information", "start_index": 259, "end_index": 260 }, { "title": "Gradient descent optimization", "start_index": 260, "end_index": 261 } ] }, { "title": "Error Backpropagation", "start_index": 261, "end_index": 262, "child_nodes": [ { "title": "Evaluation of error-function derivatives", "start_index": 262, "end_index": 265 }, { "title": "A simple example", "start_index": 265, "end_index": 266 }, { "title": "Efficiency of backpropagation", "start_index": 266, "end_index": 267 }, { "title": "The Jacobian matrix", "start_index": 267, "end_index": 269 } ] }, { "title": "The Hessian Matrix", "start_index": 269, "end_index": 270, "child_nodes": [ { "title": "Diagonal approximation", "start_index": 270, "end_index": 271 }, { "title": "Outer product approximation", "start_index": 271, "end_index": 272 }, { "title": "Inverse Hessian", "start_index": 272, "end_index": 272 }, { "title": "Finite differences", "start_index": 272, "end_index": 273 }, { "title": "Exact evaluation of the Hessian", "start_index": 273, "end_index": 274 }, { "title": "Fast multiplication by the Hessian", "start_index": 274, "end_index": 276 } ] }, { "title": "Regularization in Neural Networks", "start_index": 276, "end_index": 277, "child_nodes": [ { "title": "Consistent Gaussian priors", "start_index": 277, "end_index": 279 }, { "title": "Early stopping", "start_index": 279, "end_index": 281 }, { "title": "Invariances", "start_index": 281, "end_index": 283 }, { "title": "Tangent propagation", "start_index": 283, "end_index": 285 }, { "title": "Training with transformed data", "start_index": 285, "end_index": 287 }, { "title": "Convolutional networks", "start_index": 287, "end_index": 289 }, { "title": "Soft weight sharing", "start_index": 289, "end_index": 292 } ] }, { "title": "Mixture Density Networks", "start_index": 292, "end_index": 297 }, { "title": "Bayesian Neural Networks", "start_index": 297, "end_index": 298, "child_nodes": [ { "title": "Posterior parameter distribution", "start_index": 298, "end_index": 300 }, { "title": "Hyperparameter optimization", "start_index": 300, "end_index": 301 }, { "title": "Bayesian neural networks for classification", "start_index": 301, "end_index": 304 } ] } ] }, { "title": "Exercises", "start_index": 304, "end_index": 311 }, { "title": "Kernel Methods", "start_index": 311, "end_index": 313, "child_nodes": [ { "title": "Dual Representations", "start_index": 313, "end_index": 314 }, { "title": "Constructing Kernels", "start_index": 314, "end_index": 319 }, { "title": "Radial Basis Function Networks", "start_index": 319, "end_index": 321, "child_nodes": [ { "title": "Nadaraya-Watson model", "start_index": 321, "end_index": 323 } ] }, { "title": "Gaussian Processes", "start_index": 323, "end_index": 324, "child_nodes": [ { "title": "Linear regression revisited", "start_index": 324, "end_index": 326 }, { "title": "Gaussian processes for regression", "start_index": 326, "end_index": 331 }, { "title": "Learning the hyperparameters", "start_index": 331, "end_index": 332 }, { "title": "Automatic relevance determination", "start_index": 332, "end_index": 333 }, { "title": "Gaussian processes for classification", "start_index": 333, "end_index": 335 }, { "title": "Laplace approximation", "start_index": 335, "end_index": 339 }, { "title": "Connection to neural networks", "start_index": 339, "end_index": 340 } ] } ] }, { "title": "Exercises", "start_index": 340, "end_index": 344 }, { "title": "Sparse Kernel Machines", "start_index": 345, "end_index": 346, "child_nodes": [ { "title": "Maximum Margin Classifiers", "start_index": 346, "end_index": 351, "child_nodes": [ { "title": "Overlapping class distributions", "start_index": 351, "end_index": 356 }, { "title": "Relation to logistic regression", "start_index": 356, "end_index": 358 }, { "title": "Multiclass SVMs", "start_index": 358, "end_index": 359 }, { "title": "SVMs for regression", "start_index": 359, "end_index": 364 }, { "title": "Computational learning theory", "start_index": 364, "end_index": 365 } ] }, { "title": "Relevance Vector Machines", "start_index": 365, "end_index": 365, "child_nodes": [ { "title": "RVM for regression", "start_index": 365, "end_index": 369 }, { "title": "Analysis of sparsity", "start_index": 369, "end_index": 373 }, { "title": "RVM for classification", "start_index": 373, "end_index": 377 } ] } ] }, { "title": "Exercises", "start_index": 377, "end_index": 379 }, { "title": "Graphical Models", "start_index": 379, "end_index": 380, "child_nodes": [ { "title": "Bayesian Networks", "start_index": 380, "end_index": 382, "child_nodes": [ { "title": "Example: Polynomial regression", "start_index": 382, "end_index": 385 }, { "title": "Generative models", "start_index": 385, "end_index": 386 }, { "title": "Discrete variables", "start_index": 386, "end_index": 390 }, { "title": "Linear-Gaussian models", "start_index": 390, "end_index": 392 } ] }, { "title": "Conditional Independence", "start_index": 392, "end_index": 393, "child_nodes": [ { "title": "Three example graphs", "start_index": 393, "end_index": 398 }, { "title": "D-separation", "start_index": 398, "end_index": 403 } ] }, { "title": "Markov Random Fields", "start_index": 403, "end_index": 403, "child_nodes": [ { "title": "Conditional independence properties", "start_index": 403, "end_index": 404 }, { "title": "Factorization properties", "start_index": 404, "end_index": 407 }, { "title": "Illustration: Image de-noising", "start_index": 407, "end_index": 410 }, { "title": "Relation to directed graphs", "start_index": 410, "end_index": 413 } ] }, { "title": "Inference in Graphical Models", "start_index": 413, "end_index": 414, "child_nodes": [ { "title": "Inference on a chain", "start_index": 414, "end_index": 418 }, { "title": "Trees", "start_index": 418, "end_index": 419 }, { "title": "Factor graphs", "start_index": 419, "end_index": 422 }, { "title": "The sum-product algorithm", "start_index": 422, "end_index": 431 }, { "title": "The max-sum algorithm", "start_index": 431, "end_index": 436 }, { "title": "Exact inference in general graphs", "start_index": 436, "end_index": 437 }, { "title": "Loopy belief propagation", "start_index": 437, "end_index": 438 }, { "title": "Learning the graph structure", "start_index": 438, "end_index": 438 } ] } ] }, { "title": "Exercises", "start_index": 438, "end_index": 442 }, { "title": "Mixture Models and EM", "start_index": 443, "end_index": 444, "child_nodes": [ { "title": "K-means Clustering", "start_index": 444, "end_index": 448, "child_nodes": [ { "title": "Image segmentation and compression", "start_index": 448, "end_index": 450 } ] }, { "title": "Mixtures of Gaussians", "start_index": 450, "end_index": 452, "child_nodes": [ { "title": "Maximum likelihood", "start_index": 452, "end_index": 455 }, { "title": "EM for Gaussian mixtures", "start_index": 455, "end_index": 459 } ] }, { "title": "An Alternative View of EM", "start_index": 459, "end_index": 461, "child_nodes": [ { "title": "Gaussian mixtures revisited", "start_index": 461, "end_index": 463 }, { "title": "Relation to K-means", "start_index": 463, "end_index": 464 }, { "title": "Mixtures of Bernoulli distributions", "start_index": 464, "end_index": 468 }, { "title": "EM for Bayesian linear regression", "start_index": 468, "end_index": 470 } ] }, { "title": "The EM Algorithm in General", "start_index": 470, "end_index": 475 } ] }, { "title": "Exercises", "start_index": 475, "end_index": 480 }, { "title": "Approximate Inference", "start_index": 481, "end_index": 482, "child_nodes": [ { "title": "Variational Inference", "start_index": 482, "end_index": 484, "child_nodes": [ { "title": "Factorized distributions", "start_index": 484, "end_index": 486 }, { "title": "Properties of factorized approximations", "start_index": 486, "end_index": 490 }, { "title": "Example: The univariate Gaussian", "start_index": 490, "end_index": 493 }, { "title": "Model comparison", "start_index": 493, "end_index": 494 } ] }, { "title": "Illustration: Variational Mixture of Gaussians", "start_index": 494, "end_index": 495, "child_nodes": [ { "title": "Variational distribution", "start_index": 495, "end_index": 501 }, { "title": "Variational lower bound", "start_index": 501, "end_index": 502 }, { "title": "Predictive density", "start_index": 502, "end_index": 503 }, { "title": "Determining the number of components", "start_index": 503, "end_index": 505 }, { "title": "Induced factorizations", "start_index": 505, "end_index": 506 } ] }, { "title": "Variational Linear Regression", "start_index": 506, "end_index": 506, "child_nodes": [ { "title": "Variational distribution", "start_index": 506, "end_index": 508 }, { "title": "Predictive distribution", "start_index": 508, "end_index": 509 }, { "title": "Lower bound", "start_index": 509, "end_index": 510 } ] }, { "title": "Exponential Family Distributions", "start_index": 510, "end_index": 511, "child_nodes": [ { "title": "Variational message passing", "start_index": 511, "end_index": 512 } ] }, { "title": "Local Variational Methods", "start_index": 513, "end_index": 518 }, { "title": "Variational Logistic Regression", "start_index": 518, "end_index": 518, "child_nodes": [ { "title": "Variational posterior distribution", "start_index": 518, "end_index": 520 }, { "title": "Optimizing the variational parameters", "start_index": 520, "end_index": 522 }, { "title": "Inference of hyperparameters", "start_index": 522, "end_index": 525 } ] }, { "title": "Expectation Propagation", "start_index": 525, "end_index": 531, "child_nodes": [ { "title": "Example: The clutter problem", "start_index": 531, "end_index": 533 }, { "title": "Expectation propagation on graphs", "start_index": 533, "end_index": 537 } ] } ] }, { "title": "Exercises", "start_index": 537, "end_index": 542 }, { "title": "Sampling Methods", "start_index": 543, "end_index": 546, "child_nodes": [ { "title": "Basic Sampling Algorithms", "start_index": 546, "end_index": 546, "child_nodes": [ { "title": "Standard distributions", "start_index": 546, "end_index": 548 }, { "title": "Rejection sampling", "start_index": 548, "end_index": 550 }, { "title": "Adaptive rejection sampling", "start_index": 550, "end_index": 552 }, { "title": "Importance sampling", "start_index": 552, "end_index": 554 }, { "title": "Sampling-importance-resampling", "start_index": 554, "end_index": 556 }, { "title": "Sampling and the EM algorithm", "start_index": 556, "end_index": 556 } ] }, { "title": "Markov Chain Monte Carlo", "start_index": 557, "end_index": 559, "child_nodes": [ { "title": "Markov chains", "start_index": 559, "end_index": 561 }, { "title": "The Metropolis-Hastings algorithm", "start_index": 561, "end_index": 562 } ] }, { "title": "Gibbs Sampling", "start_index": 562, "end_index": 566 }, { "title": "Slice Sampling", "start_index": 566, "end_index": 568 }, { "title": "The Hybrid Monte Carlo Algorithm", "start_index": 568, "end_index": 568, "child_nodes": [ { "title": "Dynamical systems", "start_index": 568, "end_index": 572 }, { "title": "Hybrid Monte Carlo", "start_index": 572, "end_index": 574 } ] }, { "title": "Estimating the Partition Function", "start_index": 574, "end_index": 576 } ] }, { "title": "Exercises", "start_index": 576, "end_index": 579 }, { "title": "Continuous Latent Variables", "start_index": 579, "end_index": 581, "child_nodes": [ { "title": "Principal Component Analysis", "start_index": 581, "end_index": 581, "child_nodes": [ { "title": "Maximum variance formulation", "start_index": 581, "end_index": 583 }, { "title": "Minimum-error formulation", "start_index": 583, "end_index": 585 }, { "title": "Applications of PCA", "start_index": 585, "end_index": 589 }, { "title": "PCA for high-dimensional data", "start_index": 589, "end_index": 590 } ] }, { "title": "Probabilistic PCA", "start_index": 590, "end_index": 594, "child_nodes": [ { "title": "Maximum likelihood PCA", "start_index": 594, "end_index": 597 }, { "title": "EM algorithm for PCA", "start_index": 597, "end_index": 600 }, { "title": "Bayesian PCA", "start_index": 600, "end_index": 603 }, { "title": "Factor analysis", "start_index": 603, "end_index": 606 } ] }, { "title": "Kernel PCA", "start_index": 606, "end_index": 610 }, { "title": "Nonlinear Latent Variable Models", "start_index": 611, "end_index": 611, "child_nodes": [ { "title": "Independent component analysis", "start_index": 611, "end_index": 612 }, { "title": "Autoassociative neural networks", "start_index": 612, "end_index": 615 }, { "title": "Modelling nonlinear manifolds", "start_index": 615, "end_index": 619 } ] } ] }, { "title": "Exercises", "start_index": 619, "end_index": 624 }, { "title": "Sequential Data", "start_index": 625, "end_index": 627, "child_nodes": [ { "title": "Markov Models", "start_index": 627, "end_index": 630 }, { "title": "Hidden Markov Models", "start_index": 630, "end_index": 635, "child_nodes": [ { "title": "Maximum likelihood for the HMM", "start_index": 635, "end_index": 638 }, { "title": "The forward-backward algorithm", "start_index": 638, "end_index": 645 }, { "title": "The sum-product algorithm for the HMM", "start_index": 645, "end_index": 647 }, { "title": "Scaling factors", "start_index": 647, "end_index": 649 }, { "title": "The Viterbi algorithm", "start_index": 649, "end_index": 651 }, { "title": "Extensions of the hidden Markov model", "start_index": 651, "end_index": 655 } ] }, { "title": "Linear Dynamical Systems", "start_index": 655, "end_index": 658, "child_nodes": [ { "title": "Inference in LDS", "start_index": 658, "end_index": 662 }, { "title": "Learning in LDS", "start_index": 662, "end_index": 664 }, { "title": "Extensions of LDS", "start_index": 664, "end_index": 665 }, { "title": "Particle filters", "start_index": 665, "end_index": 666 } ] } ] }, { "title": "Exercises", "start_index": 666, "end_index": 672 }, { "title": "Combining Models", "start_index": 673, "end_index": 674, "child_nodes": [ { "title": "Bayesian Model Averaging", "start_index": 674, "end_index": 675 }, { "title": "Committees", "start_index": 675, "end_index": 677 }, { "title": "Boosting", "start_index": 677, "end_index": 679, "child_nodes": [ { "title": "Minimizing exponential error", "start_index": 679, "end_index": 681 }, { "title": "Error functions for boosting", "start_index": 681, "end_index": 683 } ] }, { "title": "Tree-based Models", "start_index": 683, "end_index": 686 }, { "title": "Conditional Mixture Models", "start_index": 686, "end_index": 687, "child_nodes": [ { "title": "Mixtures of linear regression models", "start_index": 687, "end_index": 690 }, { "title": "Mixtures of logistic models", "start_index": 690, "end_index": 692 }, { "title": "Mixtures of experts", "start_index": 692, "end_index": 694 } ] } ] }, { "title": "Exercises", "start_index": 694, "end_index": 696 }, { "title": "Appendix A Data Sets", "start_index": 697, "end_index": 704 }, { "title": "Appendix B Probability Distributions", "start_index": 705, "end_index": 714 }, { "title": "Appendix C Properties of Matrices", "start_index": 715, "end_index": 722 }, { "title": "Appendix D Calculus of Variations", "start_index": 723, "end_index": 726 }, { "title": "Appendix E Lagrange Multipliers", "start_index": 727, "end_index": 730 }, { "title": "References", "start_index": 731, "end_index": 749 }, { "title": "Index", "start_index": 749, "end_index": 758 } ]