diff --git a/.gitignore b/.gitignore index 947bdc4..3a97470 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,7 @@ notes/2025/tikz/4 notes/2025/mvp/chapters/4-nn.tex notes/2025/mvp/chapters/4-nn.pdf -0-ac* \ No newline at end of file +0-ac* + +logs/ +movs/ \ No newline at end of file diff --git a/notes/2025/mvp/chapters/2-lr.tex b/notes/2025/mvp/chapters/2-lr.tex index 6c82f6a..fc89658 100644 --- a/notes/2025/mvp/chapters/2-lr.tex +++ b/notes/2025/mvp/chapters/2-lr.tex @@ -8,9 +8,7 @@ \chapter{Logistic Regression} \item Sigmoid Regression \item Maximum a posteriori \end{introduction} -\section{Classification} - - \subsection{Binary Classification Problem} +\section{Binary Classification} Settings. \begin{itemize} @@ -58,7 +56,7 @@ \section{Classification} \centering \includegraphics{../../tikz/2/2.pdf} \caption{Classification by heperplane.} - \label{2-lr} + \label{fig:2-hyperplane} \end{figure} @@ -182,7 +180,7 @@ \section{Classification} If all points can be separated by a linear model without error, we say the dataset is {linearly separable}. \end{definition} -Example \ref{2-lr} is linearly separable, and the final state leads to $\|W\| \to \infty, \quad \|b\| \to \infty$. However, this situation is not desirable in practice, since it implies poor robustness. Hence a natural question arises: under the condition of linear separability, how can we find a well-chosen separating hyperplane that maximizes robustness? The answer will be presented in the next chapter, where we introduce the Support Vector Machine (SVM). The SVM optimizes $\hat w, \hat b$ by maximizing the margin (the distance between data points and the separating hyperplane), instead of simply minimizing the cross-entropy loss. +Figure~\ref{fig:2-hyperplane} illustrates a linearly separable setting, and the final state leads to $\|W\| \to \infty, \quad \|b\| \to \infty$. However, this situation is not desirable in practice, since it implies poor robustness. Hence a natural question arises: under the condition of linear separability, how can we find a well-chosen separating hyperplane that maximizes robustness? The answer will be presented in the next chapter, where we introduce the Support Vector Machine (SVM). The SVM optimizes $\hat w, \hat b$ by maximizing the margin (the distance between data points and the separating hyperplane), instead of simply minimizing the cross-entropy loss. Although logistic regression may suffer from divergence of parameters under separable data, it often achieves better performance than SVM in practice, due to the following reasons: \begin{enumerate} @@ -261,7 +259,7 @@ \section{Rethink of Linear Regression} \centering \includegraphics{../../tikz/2/3.pdf} \caption{Normal distribution (95\%).} - \label{2-lr} + \label{fig:2-normal} \end{figure} While the Central Limit Theorem (CLT) does not imply that most datasets are normally distributed, it motivates modeling additive noise as Gaussian. 
We assume: diff --git a/notes/2025/mvp/chapters/3-svm.tex b/notes/2025/mvp/chapters/3-svm.tex index d1e9a01..8f5c036 100644 --- a/notes/2025/mvp/chapters/3-svm.tex +++ b/notes/2025/mvp/chapters/3-svm.tex @@ -237,7 +237,8 @@ \subsection{Hard Margin} \begin{figure}[H] \centering \includegraphics{../../tikz/3/1.pdf} - \label{2-lr} + \caption{Distance from a point to a hyperplane.} + \label{fig:3-hyperplane-distance} \end{figure} \begin{proof} \begin{enumerate} @@ -962,7 +963,7 @@ \section{Kernel} \begin{equation} K(x, z) = x^\top z; \end{equation} - \item \textbf{Polynomial kernel}: maps $\mathbb{R}^n \to \mathbb{R}^{\scriptsize\begin{pmatrix}n+p\\p\end{pmatrix}}$, + \item \textbf{Polynomial kernel}: can be understood as mapping $\mathbb{R}^n$ into a feature space of dimension $\binom{n+p}{p}$, \begin{equation} K(x, z) = (x^\top z + 1)^p; \end{equation} diff --git a/notes/2025/mvp/chapters/7-gp.pdf b/notes/2025/mvp/chapters/7-gp.pdf index 6edc425..6c164c3 100644 Binary files a/notes/2025/mvp/chapters/7-gp.pdf and b/notes/2025/mvp/chapters/7-gp.pdf differ diff --git a/notes/2025/mvp/chapters/7-gp.tex b/notes/2025/mvp/chapters/7-gp.tex index 0fa7a16..a412987 100644 --- a/notes/2025/mvp/chapters/7-gp.tex +++ b/notes/2025/mvp/chapters/7-gp.tex @@ -527,7 +527,7 @@ \section{Gaussian Process Regression (GPR)} \Sigma_\star \bigr), \end{equation} -with mean +with posterior mean \begin{equation} \mu_\star = k(x_\star,X)^\top (K + \sigma^2 I)^{-1} y, @@ -564,7 +564,7 @@ \section{Gaussian Process Regression (GPR)} Gram matrix $K_\ell$ converges to a diagonal matrix: \begin{equation} \lim_{\ell \to 0} K_\ell - = \sigma_f^2 I_n. + = \sigma_f^2 I. \end{equation} In the noise-free case (i.e.\ $\sigma^2 = 0$ in the observation @@ -581,7 +581,7 @@ \section{Gaussian Process Regression (GPR)} \qquad K_\ell^{-1} \;\longrightarrow\; - \frac{1}{\sigma_f^2} I_n, + \frac{1}{\sigma_f^2} I, \end{equation} where $e_j$ is the $j$-th standard basis vector in $\mathbb R^n$. Hence @@ -595,5 +595,82 @@ \section{Gaussian Process Regression (GPR)} exactly at every training input: each training point is matched perfectly by the predictive mean. \end{remark} +%=================================== +From the discussion above, we know that the \emph{posterior mean} of a Gaussian +Process provides the predictive value for a test input, while the associated +\emph{uncertainty} can be quantified through the \emph{predictive variance}. +The relative standard deviation is simply the square root of this predictive +variance. + +Gaussian Process Regression (GPR) serves as the mathematical foundation of +\textbf{Bayesian Optimization (BO)}, a framework for performing optimization +when the objective function is expensive, noisy, or lacks analytic structure. + +\begin{definition}[Black-box Optimization] +An optimization problem is called \emph{black-box optimization} if and only if +the analytical form of the objective function is unknown and no gradient +information is available. +\end{definition} +Since the gradient of the objective function is unavailable, gradient-based +methods such as gradient descent cannot be applied in black-box settings. +The only feasible operation is \emph{point-wise evaluation}: we may query the +black box at a finite number of input locations and observe the corresponding +outputs. + +Bayesian Optimization (BO) aims to construct a probabilistic surrogate model +of the black-box function and use it to locate the maximizer of $y$ with as +few evaluations as possible. 
This is particularly important when each +evaluation is expensive, for instance in hyperparameter tuning. + +The typical BO procedure proceeds as follows: +\begin{enumerate} + \item Randomly or uniformly select a small set of initial points + $x_1,\ldots,x_n$ and obtain their evaluations $y_1,\ldots,y_n$. + \item Fit a Gaussian Process Regression (GPR) model using the collected + data. + \item Use an acquisition function $a(x)$ to select one or a batch of new + query points; evaluate them, augment the dataset, and refit the GPR model. + \item Repeat the process until the evaluation budget is exhausted, and + return the point achieving the maximum observed value of $y$. +\end{enumerate} +Here are two commonly used acquisition functions in Bayesian Optimization: + +\begin{enumerate} + \item \textbf{Expected Improvement (EI)}: + \begin{equation} + a(x) = \mathbb{E}\big[(y(x) - y_{\max})^{+}\big], + \end{equation} + where + \begin{equation} + [z]^{+} := \max(0, z). + \end{equation} + Under the GPR model, the predictive distribution is + \begin{equation} + y(x) \sim \mathcal{N}\big(\mu(x),\, \Sigma(x)\big), + \end{equation} + so EI admits a closed-form expression obtained by integrating over the + tail above $y_{\max}$: + \begin{equation} + a(x)=\int_{y_{\max}}^{+\infty} \big(y(x)-y_{\max}\big)\, + \mathcal{N}\!\left(y(x)\,\middle|\,\mu(x),\,\Sigma(x)\right)\, + \mathrm{d}y(x). + \end{equation} + + \item \textbf{Upper Confidence Bound (UCB)}: + \begin{equation} + a(x)=\mu(x) + \kappa\,\sigma(x), + \end{equation} + where $\sigma(x)$ denotes the predictive standard deviation and + $\kappa>0$ controls the exploration–exploitation balance. + In essence, UCB prefers points where the model is either promising + (large $\mu$) or highly uncertain (large $\sigma$). +\end{enumerate} + +\begin{remark} + \textbf{Optuna} is a widely used automatic hyperparameter tuning library, + recommended in class for practical Bayesian Optimization. +\end{remark} + + \end{document} \ No newline at end of file diff --git a/notes/2025/mvp/chapters/8-tel.pdf b/notes/2025/mvp/chapters/8-tel.pdf new file mode 100644 index 0000000..4218af8 Binary files /dev/null and b/notes/2025/mvp/chapters/8-tel.pdf differ diff --git a/notes/2025/mvp/chapters/8-tel.tex b/notes/2025/mvp/chapters/8-tel.tex new file mode 100644 index 0000000..89e5ac4 --- /dev/null +++ b/notes/2025/mvp/chapters/8-tel.tex @@ -0,0 +1,1094 @@ +\documentclass[../main]{subfiles} +\begin{document} +\chapter{Trees and Ensemble Learning} +\begin{introduction} + \item Decision Trees and Axis-Aligned Splits + \item Information Gain and Mutual-Information + \item Feature Selection and Purity Measures + \item Bagging and Variance Reduction + \item Random Forests and Out-of-Bag Evaluation + \item Boosting and Functional Gradient View +\end{introduction} + +\section*{Review: Gaussian Processes and Bayesian Optimization} + +In the previous chapter, we introduced Gaussian Processes (GPs) and showed how +they can be used for regression. 
Given training inputs +\begin{equation} + X = (x_1,\dots,x_n), \qquad + y = (y_1,\dots,y_n)^\top, +\end{equation} +and a test point $x_\star$, a GP prior with kernel $k(\cdot,\cdot)$ and i.i.d.\ +Gaussian observation noise of variance $\lambda$ yields the joint Gaussian +distribution +\begin{equation} + \begin{pmatrix} + y\\[2pt] + f_\star + \end{pmatrix} + \sim + \mathcal{N}\!\left( + 0,\, + \begin{bmatrix} + K(X,X) + \lambda I_n + & k(X,x_\star)\\[2pt] + k(x_\star,X) & k(x_\star,x_\star) + \end{bmatrix} + \right), +\end{equation} +where $K(X,X)$ is the $n\times n$ Gram matrix and +$k(X,x_\star) = (k(x_1,x_\star),\dots,k(x_n,x_\star))^\top$. +Conditioning on the observed $y$ gives the posterior predictive distribution +for $f_\star = f(x_\star)$: +\begin{align} + \mu_\star + &= \mathbb{E}[f_\star \mid X,y,x_\star] + = k(x_\star,X)^\top + \bigl(K(X,X)+\lambda I_n\bigr)^{-1} y, + \label{eq:gp-posterior-mean}\\ + \sigma_\star^2 + &= \operatorname{Var}[f_\star \mid X,y,x_\star] + = k(x_\star,x_\star) + - k(x_\star,X)^\top + \bigl(K(X,X)+\lambda I_n\bigr)^{-1} + k(x_\star,X). + \label{eq:gp-posterior-var} +\end{align} +By evaluating $\mu_\star$ at many test points and optionally plotting +$\mu_\star \pm \sigma_\star$, we obtain a smooth predictive curve together with +credible intervals that quantify uncertainty. + +\subsection*{Black-Box Optimization with Gaussian Processes} + +An important application of GP regression is \emph{Bayesian Optimization} (BO), +which addresses \emph{black-box optimization} problems: +\begin{equation} + \max_{x\in\mathcal{X}} f(x), +\end{equation} +where: +\begin{itemize} + \item $f(x)$ has no known analytic form (no closed-form expression); + \item gradients $\nabla f(x)$ are unavailable; + \item each evaluation of $f(x)$ is \emph{expensive}. +\end{itemize} +We can, however, query the black box at chosen points: +\begin{equation} + y = f(x) + \varepsilon, +\end{equation} +and use these point evaluations to build a surrogate model. + +A canonical example is \emph{hyperparameter tuning} for neural networks or +other machine learning models. Here: +\begin{itemize} + \item $x$ encodes a vector of hyperparameters (learning rate, width, depth, + regularization strength, \dots); + \item $f(x)$ is the validation performance (e.g.\ accuracy) obtained by + training the model with hyperparameters $x$ and evaluating on a validation + set. +\end{itemize} +Each function evaluation requires a full train--validate cycle and is therefore +very costly, so we must find good $x$ using as \emph{few} evaluations as +possible. + +\begin{remark} + Bayesian Optimization with a GP surrogate proceeds iteratively: + \begin{enumerate} + \item \textbf{Initialization.} Select an initial design + $\{x_i\}_{i=1}^n$ (e.g.\ random or space-filling) and evaluate + $y_i = f(x_i)$. + \item \textbf{Fit GP surrogate.} Use $\{(x_i,y_i)\}_{i=1}^n$ to fit a GP + regression model, yielding posterior mean $\mu(x)$ and variance + $\sigma^2(x)$ for all $x\in\mathcal{X}$. + \item \textbf{Acquisition maximization.} Define an \emph{acquisition + function} $a(x)$ that uses $\mu(x)$ and $\sigma(x)$ to score candidate + points, and choose the next evaluation point(s) by + \begin{equation} + x_{\text{next}} \in \argmax_{x\in\mathcal{X}} a(x). + \end{equation} + \item \textbf{Evaluate and update.} Query the black box at + $x_{\text{next}}$ to obtain $y_{\text{next}} = f(x_{\text{next}})$, augment + the dataset, and refit (or update) the GP surrogate. 
+ \item \textbf{Repeat} Steps 2--4 until a pre-specified evaluation budget is + exhausted, then return the best observed $y$ (or its maximizer). + \end{enumerate} + The acquisition function is responsible for balancing + \emph{exploration} (sampling uncertain regions where $\sigma(x)$ is large) and + \emph{exploitation} (sampling near currently promising regions where + $\mu(x)$ is large). +\end{remark} + +\subsection*{Acquisition Functions} + +Let $y_{\max}$ denote the best function value observed so far, and suppose the +GP posterior at $x$ is Gaussian: +\begin{equation} + f(x)\mid \mathcal{D} + \sim \mathcal{N}\bigl(\mu(x), \sigma^2(x)\bigr). +\end{equation} +Two widely used acquisition functions are: +\begin{itemize} + \item \textbf{Expected Improvement (EI).} + Define the \emph{improvement} at $x$ as + \begin{equation} + I(x) = \bigl(f(x) - y_{\max}\bigr)_+ + = \max\{f(x)-y_{\max},\,0\}. + \end{equation} + The Expected Improvement is + \begin{equation} + a_{\mathrm{EI}}(x) + = \mathbb{E}[I(x)\mid\mathcal{D}]. + \end{equation} + Under the Gaussian posterior, this has a closed form: + \begin{equation} + a_{\mathrm{EI}}(x) + = (\mu(x)-y_{\max})\Phi(z) + \sigma(x)\phi(z), + \qquad + z = \frac{\mu(x)-y_{\max}}{\sigma(x)}, + \end{equation} + where $\Phi$ and $\phi$ are the CDF and PDF of the standard normal + distribution, respectively. EI focuses on regions that, on average, are + likely to yield improvements over the current best value. + + \item \textbf{Upper Confidence Bound (UCB).} + For a parameter $\kappa>0$, define + \begin{equation} + a_{\mathrm{UCB}}(x) + = \mu(x) + \kappa\,\sigma(x). + \end{equation} + When $\kappa=1$, $a_{\mathrm{UCB}}(x)$ follows the upper envelope + $\mu(x)+\sigma(x)$ of the GP posterior. Larger $\kappa$ encourages more + exploration (sampling high-uncertainty regions), while smaller $\kappa$ + favors exploitation. +\end{itemize} +In practice, BO alternates between refining the surrogate in regions where data +are scarce and zooming in on promising areas, progressively improving our +estimate of both the location and value of the optimum. + +\begin{remark} + Many modern hyperparameter optimization libraries implement Bayesian + Optimization with GP or related surrogates. For example, \texttt{Optuna} + provides a flexible interface where the user specifies: + \begin{itemize} + \item the search space of hyperparameters; + \item an evaluation budget (maximum number of trials); + \item an objective function that trains and evaluates the model. + \end{itemize} + The library then automatically manages the BO loop, returning (approximately) + optimal hyperparameters within the given budget. +\end{remark} + +\section{Decision Trees and Feature Selection} + +\subsection{Decision Trees and Axis-Aligned Splits} + +Previously, we used linear models (or linear models in a high-dimensional +feature space) to perform binary classification. The classifier can be +written as +\begin{equation} +f(x) = +\begin{cases} ++1, & \text{if } w^\top x + b \ge 0,\\[4pt] +-1, & \text{if } w^\top x + b < 0. +\end{cases} +\end{equation} +The decision boundary is the hyperplane +\begin{equation} +w^\top x + b = 0. +\end{equation} + +Another way to obtain more flexible, nonlinear decision boundaries is to use +\emph{tree-based models}. 
A simple decision tree implementing the rule above +can be drawn as follows: +\begin{figure}[h] + \centering + \includegraphics{../../tikz/8/1.pdf} +\end{figure} +\begin{definition}[Decision Tree] + A \emph{decision tree} is a tree consisting of a root node, internal nodes, + leaf nodes, and directed edges. + Each internal node partitions the data according to some feature. + For any input example, its final prediction is given by the label of the leaf + node it reaches. + \end{definition} + + \begin{example} + \newcommand{\cmark}{\ding{51}} % ✓ +\newcommand{\xmark}{\ding{55}} % ✗ + +\begin{wraptable}[20]{r}{0.4\linewidth} + \centering + % \caption{Feature table for the decision tree example} + \vspace{2mm} + \begin{tabular}{c|ccc|c} + \textbf{ID} & \textbf{A} & \textbf{B} & \textbf{C} & \textbf{y} \\ \hline + 1 & \cmark & \cmark & \cmark & \cmark \\ + 2 & \cmark & \xmark & \xmark & \cmark \\ + 3 & \cmark & \xmark & \cmark & \xmark \\ + 4 & \xmark & \cmark & \cmark & \xmark \\ + 5 & \xmark & \xmark & \xmark & \xmark \\ + 6 & \xmark & \xmark & \xmark & \xmark \\ + 7 & \xmark & \cmark & \xmark & \xmark \\ + 8 & \cmark & \xmark & \cmark & \xmark \\ + 9 & \xmark & \cmark & \cmark & \xmark \\ + \end{tabular} +\end{wraptable} + + We consider a binary classification setting where the label set is + \begin{equation} + Y \in \{-1, +1\}. + \end{equation} + Here, the interpretation is: + \begin{equation} + +1: \text{ a good researcher}, \qquad + -1: \text{ a bad researcher}. + \end{equation} + + Assume we have three binary features: + + \begin{itemize} + \item Feature \(A\): + \begin{equation} + +1: \text{ hardworking}, \qquad -1: \text{ not hardworking}. + \end{equation} + + \item Feature \(B\): + \begin{equation} + +1: \text{ has good vision}, \qquad -1: \text{ not}. + \end{equation} + + \item Feature \(C\): + \begin{equation} + +1: \text{ likes bananas}, \qquad -1: \text{ not}. + \end{equation} + \end{itemize} + + A decision tree may use these features at its internal nodes to determine + whether an example will ultimately be predicted as \(+1\) or \(-1\). + \end{example} + \begin{solution} + + \vspace{1em} + \begin{center} + \begin{minipage}{0.55\linewidth} + \centering + \includegraphics[width=\linewidth]{../../tikz/8/2.pdf} + \end{minipage} + \hfill + \begin{minipage}{0.33\linewidth} + \centering + \includegraphics[width=\linewidth]{../../tikz/8/3.pdf} + \end{minipage} + \end{center} + + \vspace{1em} + \end{solution} + +If we choose feature $C$ as the root node, the samples are not separated as cleanly as when we use feature $A$. +This leads to a natural question: \emph{given many candidate features, how can we choose the one that produces the ``purest'' split at the root?} + +From the example above, a good feature should split the training set into subsets whose labels are as pure (i.e., as close to all positive or all negative) as possible. +To quantify this notion of purity---or, conversely, of randomness---we introduce the notion of entropy. + +\subsection{Entropy, Information Gain, and Mutual Information} + +\begin{definition}[Entropy] + For a discrete random variable $X$ with distribution $P$, the (Shannon) + entropy of $P$ is defined as + \begin{equation} + H(P) + := -\sum_{x} P(x)\log P(x). + \end{equation} + Entropy measures the intrinsic uncertainty of the distribution. 
Equivalently,
+  \begin{equation}
+    H(P)
+    = \mathbb{E}_{X\sim P}\bigl[-\log P(X)\bigr],
+  \end{equation}
+  so $-\log P(X)$ can be viewed as the \emph{information content} of the outcome
+  $X$, and $H(P)$ is its average over many draws.
+\end{definition}
+Entropy has several essential properties.
+\begin{enumerate}
+\item
+\textbf{Non-negativity.} Since $\log P(x) \le 0$ for all $x$, we have
+\begin{equation}
+  H(P) = -\sum_x P(x)\log P(x) \ge 0,
+\end{equation}
+with equality if and only if the distribution is degenerate (i.e., $P(x)=1$
+for some $x$).
+\item \textbf{Upper bound.}
+Applying Jensen's inequality to the concave function $\log(\cdot)$:
+\begin{equation}
+  H(P)
+  = \mathbb{E}_{P}\!\left[\log \frac{1}{P(X)}\right]
+  \le \log \mathbb{E}_{P}\!\left[\frac{1}{P(X)}\right]
+  = \log \sum_{x:\,P(x)>0} 1.
+\end{equation}
+In particular, for a distribution supported on $n$ distinct outcomes,
+\begin{equation}
+  H(P) \le \log n,
+\end{equation}
+and equality holds if and only if $P$ is the uniform distribution:
+\begin{equation}
+  P(x) = \frac{1}{n}.
+\end{equation}
+\end{enumerate}
+\begin{remark}
+  Beyond measuring uncertainty, entropy also characterizes the fundamental
+  limit of lossless data compression.
+
+  For a discrete source with distribution $P$, any prefix-free coding scheme
+  assigns a codeword of length $\ell(x)$ to each symbol $x$. Shannon's source
+  coding theorem states that the expected code length satisfies
+  \begin{equation}
+    \mathbb{E}[\ell(X)]
+    \;\ge\;
+    H(P)
+    := -\sum_x P(x)\log P(x),
+  \end{equation}
+  with equality achievable asymptotically.
+
+  Thus, the entropy $H(P)$ represents the minimal achievable average number of
+  bits required to encode samples drawn from $P$. In this sense, entropy
+  quantifies both the intrinsic uncertainty of a distribution and the optimal
+  compression rate for a lossless encoding.
+\end{remark}
+\begin{definition}[Cross-Entropy]
+Let $P$ and $Q$ be two distributions over the same support.
+The cross-entropy between $P$ and $Q$ is defined as
+\begin{equation}
+  H(P,Q)
+  := -\sum_x P(x)\log Q(x)
+  = \mathbb{E}_{P}[-\log Q(X)].
+\end{equation}
+\end{definition}
+
+Observe:
+\begin{equation}
+  H(P,Q) - H(P)
+  = -\sum_x P(x)\log Q(x)
+   +\sum_x P(x)\log P(x)
+  = \sum_x P(x)\log\frac{P(x)}{Q(x)}.
+\end{equation}
+\begin{definition}[Kullback--Leibler (KL) Divergence]
+  Let $P$ and $Q$ be two probability distributions over the same support.
+  The Kullback--Leibler divergence from $P$ to $Q$ is defined as
+  \begin{equation}
+    D_{\mathrm{KL}}(P \,\|\, Q)
+    := \sum_{x} P(x)\,\log\frac{P(x)}{Q(x)}
+  \end{equation}
+  in the discrete case, or
+  \begin{equation}
+    D_{\mathrm{KL}}(P \,\|\, Q)
+    := \int p(x)\,\log\frac{p(x)}{q(x)}\,\mathrm{d}x
+  \end{equation}
+  in the continuous case.
+
+  KL divergence measures how different the distribution $P$ is from $Q$.
+  It satisfies
+  \begin{equation}
+    D_{\mathrm{KL}}(P\|Q) \ge 0,
+  \end{equation}
+  with equality if and only if $P = Q$ almost everywhere.
+\end{definition}
+Thus,
+\begin{equation}
+  H(P,Q) - H(P)
+  = D_{\mathrm{KL}}(P\|Q) \ge 0,
+\end{equation}
+with equality if and only if $P = Q$.
+
+Cross-entropy decomposes into:
+\begin{equation}
+  H(P,Q) = H(P) + D_{\mathrm{KL}}(P\|Q),
+\end{equation}
+meaning that cross-entropy is always at least as large as entropy and grows
+with the divergence between $P$ and $Q$.
+\begin{remark}
+  Huffman coding is a greedy algorithm for constructing an optimal
+  prefix-free code when the symbol probabilities are known.
+  More frequent symbols receive shorter codewords and rare symbols receive
+  longer ones.
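+  For instance, assuming base-2 logarithms, a source with three symbols of
+  probabilities $\tfrac12,\tfrac14,\tfrac14$ receives the prefix-free codewords
+  $0$, $10$, $11$, so the expected code length is
+  \begin{equation}
+    \mathbb{E}[\ell(X)]
+    = \tfrac12\cdot 1 + \tfrac14\cdot 2 + \tfrac14\cdot 2
+    = 1.5 = H(P)
+  \end{equation}
+  bits per symbol; the entropy bound is met exactly here because every
+  probability is a power of $\tfrac12$.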
Shannon's source coding theorem guarantees that the expected + code length of a Huffman code is no larger than $H(P)+1$ bits per symbol, + so Huffman coding is essentially optimal among all lossless prefix codes. + \begin{center} + \vspace{0.3em} + \includegraphics{../../tikz/8/4.pdf} + \end{center} + Huffman coding is tightly connected to entropy and cross-entropy. + + \textbf{1. Entropy as the optimal code length.} + For a source with distribution $P$, the entropy + \begin{equation} + H(P) = -\sum_x P(x)\log P(x) + \end{equation} + is the theoretical lower bound on the average number of bits needed to encode + symbols drawn from $P$. Huffman coding produces the optimal prefix-free code, + and its expected code length $L_{\mathrm{Huff}}$ satisfies + \begin{equation} + H(P) \le L_{\mathrm{Huff}} < H(P)+1. + \end{equation} + + \textbf{2. Cross-entropy as the cost of using a wrong code.} + Suppose we design a Huffman code based on a \emph{wrong} distribution $Q$ + but the true data come from $P$. Then the expected code length becomes + \begin{equation} + \mathbb{E}_{P}[-\log Q(X)] + = H(P,Q), + \end{equation} + which is the cross-entropy between $P$ and $Q$. + + \textbf{3. Extra penalty equals KL divergence.} + The additional number of bits required due to the mismatch between $P$ and $Q$ + is + \begin{equation} + H(P,Q) - H(P) + = D_{\mathrm{KL}}(P\|Q). + \end{equation} + Thus, the KL divergence quantifies the inefficiency of using a code optimized + for $Q$ when the true distribution is $P$. + + \textbf{Summary.} + Entropy gives the minimal achievable rate, Huffman coding attains it (up to one + bit), and cross-entropy / KL divergence measure how much extra cost arises when + the coding distribution does not match the true distribution. + \end{remark} + \begin{theorem}[Shannon's Source Coding Theorem*] + Let $X$ be a discrete memoryless source with distribution $P$. + For any prefix-free code with codeword lengths $\{\ell(x)\}$, the + expected code length satisfies + \begin{equation} + \mathbb{E}[\ell(X)] \;\ge\; H(P). + \end{equation} + Moreover, for every $\varepsilon>0$, there exists a prefix-free code such that + \begin{equation} + \mathbb{E}[\ell(X)] \;\le\; H(P)+\varepsilon, + \end{equation} + for sufficiently long block coding. + Thus $H(P)$ is the optimal achievable rate for lossless compression. + \end{theorem} +\begin{proof} + The theorem consists of two parts: a converse (no code beats entropy) and + an achievability (entropy can be approached arbitrarily closely). + + \textbf{1. Converse: No prefix-free code can beat $H(P)$.} + For any prefix-free code, Kraft's inequality gives + \begin{equation} + \sum_x 2^{-\ell(x)} \le 1. + \end{equation} + Multiplying both sides by $P(x)$ and applying Jensen's inequality to the + convex function $-\log(\cdot)$ yields + \begin{equation} + \mathbb{E}[\ell(X)] + \;=\; \sum_x P(x)\ell(x) + \;\ge\; -\sum_x P(x)\log P(x) + \;=\; H(P). + \end{equation} + Thus entropy is a lower bound on any valid coding scheme. + + \textbf{2. Achievability: Constructing codes approaching $H(P)$.} + Consider $n$ i.i.d. samples $X^n$ from the source. + The Asymptotic Equipartition Property (AEP) states that for any + $\varepsilon>0$, with probability approaching $1$, sequences in the + typical set $\mathcal{T}_\varepsilon^{(n)}$ satisfy + \begin{equation} + -\frac{1}{n}\log P(X^n) + \in [H(P)-\varepsilon,\; H(P)+\varepsilon]. + \end{equation} + The typical set contains approximately $2^{nH(P)}$ sequences. 
+ We can therefore assign to each typical sequence a codeword of length + \[ + nH(P)+o(n), + \] + while atypical sequences receive longer codewords but contribute + negligibly to the expected length. + This yields a block code with + \begin{equation} + \mathbb{E}[\ell(X^n)] + \le n(H(P)+\varepsilon). + \end{equation} + + \textbf{3. Conclusion.} + Combining the converse and achievability gives the optimal rate: + \[ + \lim_{n\to\infty} \frac{1}{n}\mathbb{E}[\ell(X^n)] = H(P). + \] + Thus $H(P)$ is both a fundamental lower bound and an achievable rate for + lossless source coding. + \end{proof} + + To measure how ``pure'' the labels in a dataset are, we use the notion of + \emph{information gain}. + + Let $D$ be a training set. + Labels lie in a finite set $Y\in\{1,\ldots,K\}$. + For each label $k$, define the subset + \begin{equation} + C_k := \{x\in D : y(x)=k\}. + \end{equation} + + The entropy of $D$ with respect to its empirical label distribution is + \begin{equation} + H(D) + = -\sum_{k=1}^K + \frac{|C_k|}{|D|} + \log\frac{|C_k|}{|D|} + \approx + -\sum_{y=1}^K P(y)\log P(y) + = H(Y), + \end{equation} + where $P(y)$ denotes the empirical frequency of label $y$. + + Let $A$ be a feature taking values in $\{a_1,\ldots,a_m\}$, and let + \begin{equation} + D_{a_i} := \{x\in D : A(x)=a_i\} + \end{equation} + be the subset of samples with feature value $a_i$. + + The conditional entropy of labels given the feature is + \begin{equation} + H(D\mid A) + = + \sum_{i=1}^m + \frac{|D_{a_i}|}{|D|} + H(D_{a_i}) + = + \sum_{i=1}^m + \frac{|D_{a_i}|}{|D|} + \left( + -\sum_{k=1}^K + \frac{|D_{a_i}\cap C_k|}{|D_{a_i}|} + \log \frac{|D_{a_i}\cap C_k|}{|D_{a_i}|} + \right), + \end{equation} +\begin{note} + \begin{equation} + H(D\mid A)=\mathbb E_{x|A}[H(D_x)] + \end{equation} +\end{note} + The \emph{information gain} of feature $A$ is defined as + \begin{definition}[Information gain (Mutual Information)] + \begin{equation} + g(D,A) + = + H(D) - H(D\mid A). + \end{equation} + \end{definition} + + A large value of $g(D,A)$ means that feature $A$ significantly reduces label + uncertainty, producing purer label subsets and therefore acting as a good + splitting feature in decision tree construction. + + More generally, for two random variables $X$ and $Y$, their \emph{mutual + information} is + \begin{equation} + I(X;Y) + = H(X) - H(X\mid Y) + = H(Y) - H(Y\mid X) + = H(X) + H(Y) - H(X,Y). + \end{equation} + In our setting, information gain is exactly the mutual information between the + feature and the label: + \begin{equation} + g(D,A) \approx I(Y;A). + \end{equation} + We therefore choose the best feature $A$ by + \begin{equation} + A^\star=\argmax_A g(D,A). + \end{equation} + + If two random variables satisfy $I(X;Y)=0$, then they are independent: knowing + one of them does not reduce the uncertainty about the other. + + Mutual information has several important properties: + \begin{enumerate} + \item \textbf{Symmetry.} $I(X;Y) = I(Y;X)$. + \item \textbf{Non-negativity.} $I(X;Y) \ge 0$, with equality if and only if $X$ and $Y$ are independent. + \item \textbf{Alternative forms.} $I(X;Y) = H(X) + H(Y) - H(X,Y)$. + \end{enumerate} + + The larger the mutual information, the more correlated $X$ and $Y$ are. When $X$ and $Y$ are highly correlated, knowing $X$ significantly reduces the uncertainty about $Y$, and vice versa. 
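+
+As a concrete check (using base-2 logarithms), revisit the researcher example
+above: two of the nine samples are positive, so
+\begin{equation}
+  H(D) = -\tfrac{2}{9}\log\tfrac{2}{9} - \tfrac{7}{9}\log\tfrac{7}{9}
+  \approx 0.764.
+\end{equation}
+Splitting on feature $A$ produces one subset of four samples (two of them
+positive) and one pure subset of five negatives, so
+\begin{equation}
+  H(D\mid A) = \tfrac{4}{9}\cdot 1 + \tfrac{5}{9}\cdot 0 \approx 0.444,
+  \qquad
+  g(D,A) \approx 0.320,
+\end{equation}
+whereas splitting on feature $C$ gives subsets with one positive out of five
+and one positive out of four, yielding $g(D,C) \approx 0.003$. Information
+gain therefore confirms that $A$ is the better root split.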
+ +\begin{figure}[h] + \centering + \includegraphics{../../tikz/8/5.pdf} +\end{figure} + +\subsection{Information Gain Ratio} + +Information gain has a significant drawback: it tends to favor features with many possible values. + +\begin{example} + Consider a dataset with two features: $A$ (taking values $a_1, a_2$) and $B$ (taking values $b_1, \ldots, b_{10}$). Suppose that: + \begin{itemize} + \item Feature $A$ splits the data into two subsets, each containing 5 classes uniformly. + \item Feature $B$ splits the data into 10 pure subsets, each containing only one class. + \end{itemize} + Then $g(D,A) = \log 10 - \log 5 = \log 2$, while $g(D,B) = \log 10 - 0 = \log 10$. + Information gain would prefer feature $B$, even though it may have poor generalization ability. + + More extremely, if we use the index (row number) as a feature, each example would fall into its own pure subset, yielding maximum information gain $g(D,\mathrm{index}) = H(D)$, but such a feature has no generalization ability for new test examples. +\end{example} + +To address this issue, we introduce the \emph{information gain ratio}, which penalizes features with many values. + +\begin{definition}[Information Gain Ratio] + The information gain ratio of feature $A$ is defined as + \begin{equation} + \mathrm{GR}(D,A) + = \frac{g(D,A)}{H(D,A)}, + \end{equation} + where $H(D,A)$ is the entropy of feature $A$ itself: + \begin{equation} + H(D,A) + = -\sum_{i=1}^m \frac{|D_{a_i}|}{|D|}\log\frac{|D_{a_i}|}{|D|}. + \end{equation} + This measures the intrinsic uncertainty of the feature $A$'s distribution. +\end{definition} + +The denominator $H(D,A)$ penalizes features with many values. When $A$ has many uniformly distributed values, $H(D,A)$ approaches $\log m$, which reduces the gain ratio. This helps prevent overfitting to features that achieve high purity simply by having many possible values. + +\subsection{Gini Index} + +Another common purity measure is the \emph{Gini index}, which serves as an alternative to entropy. + +\begin{definition}[Gini Index] + For a dataset $D$ with labels in $\{1,\ldots,K\}$, the Gini index is defined as + \begin{equation} + \mathrm{Gini}(D) + = 1 - \sum_{k=1}^K \left(\frac{|C_k|}{|D|}\right)^2 + = \sum_{k=1}^K \frac{|C_k|}{|D|}\left(1 - \frac{|C_k|}{|D|}\right), + \end{equation} + where $C_k = \{x \in D : y(x) = k\}$. +\end{definition} + +The Gini index measures the expected error rate if we randomly label examples according to the class distribution. It satisfies: +\begin{itemize} + \item $\mathrm{Gini}(D) = 0$ when $D$ is pure (all examples have the same label). + \item $\mathrm{Gini}(D)$ is maximized when the class distribution is uniform, taking the value $1 - 1/K$ for $K$ classes. + \item For binary classification ($K=2$), $\mathrm{Gini}(D) = 2p(1-p)$, where $p$ is the proportion of the positive class. +\end{itemize} + +The conditional Gini index given feature $A$ is: +\begin{equation} + \mathrm{Gini}(D \mid A) + = \sum_{i=1}^m \frac{|D_{a_i}|}{|D|} \mathrm{Gini}(D_{a_i}). +\end{equation} + +For feature selection, we choose the feature that minimizes the conditional Gini index: +\begin{equation} + A^\star = \argmin_A \mathrm{Gini}(D \mid A). +\end{equation} + +\begin{remark} + For binary classification, the Gini index and entropy (scaled by $1/2$) are very similar functions of the class probability $p$. Both achieve their minimum at $p \in \{0,1\}$ and maximum at $p = 1/2$. 
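+  For instance, with base-2 logarithms, at $p=\tfrac12$ both $2p(1-p)$ and
+  $\tfrac12 H(p)$ equal $\tfrac12$, while at $p=0.1$ they are $0.18$ and
+  roughly $0.23$, respectively.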
+\end{remark} + +For regression problems, where labels are continuous, we use a different criterion based on squared loss. + +\begin{definition}[L2 Loss Criterion] + For a regression problem, after splitting by feature $A$ into subsets $D_1,\ldots,D_m$, define the mean label in each subset: + \begin{equation} + \bar{y}_{D_i} = \frac{1}{|D_i|}\sum_{j \in D_i} y_j. + \end{equation} + The L2 loss criterion is: + \begin{equation} + L(D,A) + = \sum_{i=1}^m \sum_{j \in D_i} (y_j - \bar{y}_{D_i})^2. + \end{equation} + We choose the feature that minimizes this criterion: + \begin{equation} + A^\star = \argmin_A L(D,A). + \end{equation} +\end{definition} + +This criterion measures the variance within each subset after splitting. A smaller L2 loss indicates purer subsets, where examples in each subset have labels close to their subset mean. The mean $\bar{y}_{D_i}$ serves as the prediction for examples in subset $D_i$. + +\subsection{Building Decision Trees: Greedy Algorithm} + +Given multiple feature selection criteria (information gain, information gain ratio, Gini index, or L2 loss), we can build a decision tree. The naive approach of enumerating all possible trees is computationally infeasible (super-exponential complexity). Instead, we use a \emph{greedy algorithm}. + +The greedy algorithm builds the tree recursively: + + \textbf{Input:} Training set $D$, feature set $\mathcal{F}$. + + \textbf{Procedure:} + \begin{enumerate} + \item \textbf{Choose a feature} $A \in \mathcal{F}$ according to a purity metric (e.g., information gain, information gain ratio, Gini index, or L2 loss). + \item \textbf{Partition} $D$ into subsets $D_1,\ldots,D_m$ based on feature $A$'s values $a_1,\ldots,a_m$. + \item \textbf{Recursively build subtrees} for each subset $D_i$ using the remaining features $\mathcal{F} \setminus \{A\}$. + + \textbf{Why remove $A$?} Since all examples in $D_i$ share the same value for feature $A$, further splitting by $A$ would be redundant. + + \item \textbf{Termination conditions:} + \begin{itemize} + \item The feature set $\mathcal{F}$ is empty (no more features available). + \item The subset $D_i$ is pure (contains only one class for classification, or has very low variance for regression). + \item Maximum depth is reached. + \item Purity improvement is below a threshold. + \item Number of samples in a node is below a minimum threshold. + \end{itemize} + \item When termination occurs, the node becomes a \textbf{leaf node} with label equal to the majority class (for classification) or the mean value (for regression) in that subset. + \end{enumerate} + + \textbf{Prediction:} For a test example, traverse the tree according to its feature values until reaching a leaf node, then use the leaf's label as the prediction. + + +\begin{remark} + The naive approach of enumerating all possible trees would require exploring all permutations of feature selection orders, leading to super-exponential complexity. The greedy algorithm reduces this to polynomial time by making locally optimal choices at each step, though it does not guarantee global optimality. +\end{remark} + +\begin{remark} + To prevent overfitting, common regularization techniques include: + \begin{itemize} + \item Limiting the maximum depth of the tree. + \item Setting a minimum number of samples required to split a node. + \item Setting a minimum purity improvement threshold. + \item Using a validation set to monitor performance and stop early. 
+ \end{itemize} +\end{remark} + +\begin{remark} + The greedy algorithm is heuristic: it makes locally optimal choices at each step but does not guarantee a globally optimal tree. Despite this, decision trees and their ensemble variants (Random Forests, Gradient Boosting) are among the most powerful methods in practice, especially for tabular data. +\end{remark} + +\subsection{Continuous Features in Decision Trees} + +In the preceding discussion, we implicitly assumed that features are discrete, +taking values in a finite set. For such a feature $A$ with possible values +$\{a_1,\dots,a_m\}$, a split on $A$ partitions the training set into $m$ +subsets $D_{a_1},\dots,D_{a_m}$. + +In practice, many features are continuous. To handle a continuous feature $A$, +we can discretize it by introducing \emph{thresholds}. Let +\begin{equation} + \{a^{(1)},\dots,a^{(n)}\} +\end{equation} +be the distinct values of $A$ observed in the training set (for some feature +dimension). Sort them so that +\begin{equation} + a^{(1)} < a^{(2)} < \dots < a^{(n)}. +\end{equation} +Candidate thresholds can then be placed between successive values: +\begin{equation} + \tau_j = \frac{a^{(j)} + a^{(j+1)}}{2}, \qquad j=1,\dots,n-1. +\end{equation} +Each threshold $\tau_j$ defines a binary split +\begin{equation} + A \le \tau_j + \quad\text{vs.}\quad + A > \tau_j, +\end{equation} +which we can evaluate using any of our usual criteria (information gain, Gini +index, squared loss for regression, etc.). By scanning over all candidate +thresholds and choosing the best one according to the purity measure, the tree +handles continuous features while remaining axis-aligned. + +At prediction time, a test example with continuous feature value $A(x)$ is +routed through the tree by comparing $A(x)$ with the learned thresholds at each +internal node, just as if the feature had been discrete. + +\section{From Single Trees to Ensembles} + +So far, we have seen how to build and select a \emph{single} decision tree, by +choosing features that maximize information gain at each split. In practice, +however, a single tree often suffers from two issues: +\begin{itemize} + \item \textbf{High variance.} Small changes in the training data may lead to + very different trees, especially when the tree is grown deep and fits the + training set closely. + \item \textbf{Limited accuracy.} A single tree is easy to interpret, but its + predictive performance may lag behind more powerful models such as kernel + methods or neural networks. +\end{itemize} + +Ensemble learning addresses these issues by combining the predictions of many +base learners (often decision trees) to form a stronger model. + +\begin{definition}[Ensemble Learning] + Ensemble learning constructs a predictor by aggregating a collection of + base learners (also called \emph{weak learners}). Given base predictors + $f_1,\dots,f_M$, the ensemble predictor takes the form + \begin{equation} + F(x) = + \begin{cases} + \mathrm{majority\ vote}\bigl(f_1(x),\dots,f_M(x)\bigr), + & \text{(classification)},\\[4pt] + \dfrac{1}{M}\sum_{m=1}^M f_m(x), + & \text{(regression)}. + \end{cases} + \end{equation} + The key idea is that even if each $f_m$ is only moderately accurate, their + combination can be significantly more accurate and more robust. +\end{definition} + +\subsection{Bias--Variance Decomposition} +Consider a supervised learning setting with training dataset $D$, drawn from +some unknown data-generating distribution. 
Let $x$ denote a test input with +true label $y$, and let +\begin{equation} + f(x;D) +\end{equation} +denote the prediction of a learning algorithm trained on $D$ when evaluated at +$x$. Under squared loss, the prediction error at $(x,y)$ for a given training +set $D$ is +\begin{equation} + \bigl(f(x;D) - y\bigr)^2. +\end{equation} +Since $D$ is random, it is natural to measure performance by averaging over the +random draw of $D$. We therefore consider +\begin{equation} + \mathbb{E}_{D}\Bigl[\bigl(f(x;D) - y\bigr)^2\Bigr], +\end{equation} +which quantifies the expected squared error at $x$, averaged over the +randomness in the training data. + +Define the \emph{average prediction} at $x$ as +\begin{equation} + \bar{f}(x) + := \mathbb{E}_{D}\bigl[f(x;D)\bigr], +\end{equation} +that is, the expected prediction we would obtain if we could repeatedly sample +training sets $D$ from the underlying distribution and retrain the model. +Insert and subtract $\bar{f}(x)$ inside the square: +\begin{align} + \mathbb{E}_{D}\Bigl[\bigl(f(x;D) - y\bigr)^2\Bigr] + &= \mathbb{E}_{D} + \Bigl[\bigl(f(x;D) - \bar{f}(x) + \bar{f}(x) - y\bigr)^2\Bigr]\\ + &= \mathbb{E}_{D} + \Bigl[\bigl(f(x;D) - \bar{f}(x)\bigr)^2\Bigr] + + \bigl(\bar{f}(x) - y\bigr)^2\\ + &\quad + + 2\,\mathbb{E}_{D}\Bigl[\bigl(f(x;D) - \bar{f}(x)\bigr)\bigl(\bar{f}(x) - y\bigr)\Bigr]. +\end{align} +The cross term vanishes because +\begin{equation} + \mathbb{E}_{D}[f(x;D) - \bar{f}(x)] + = \mathbb{E}_{D}[f(x;D)] - \bar{f}(x) = 0, +\end{equation} +and $\bar{f}(x)-y$ does not depend on $D$. Hence +\begin{equation} + \mathbb{E}_{D}\Bigl[\bigl(f(x;D) - y\bigr)^2\Bigr] + = + \underbrace{\mathbb{E}_{D} + \Bigl[\bigl(f(x;D) - \bar{f}(x)\bigr)^2\Bigr]}_{\text{variance at }x} + + + \underbrace{\bigl(\bar{f}(x) - y\bigr)^2}_{\text{bias}^2 \text{ at }x}. +\end{equation} + +\begin{definition}[Bias and variance at a test point] + For a fixed test input $x$ with true label $y$, the + \emph{variance} and \emph{(squared) bias} of a learning algorithm are + defined as + \begin{align} + \mathrm{Var}_D\bigl[f(x;D)\bigr] + &= \mathbb{E}_{D} + \Bigl[\bigl(f(x;D)-\bar{f}(x)\bigr)^2\Bigr],\\ + \mathrm{Bias}^2(x) + &= \bigl(\bar{f}(x) - y\bigr)^2. + \end{align} +\end{definition} + +Intuitively: +\begin{itemize} + \item The variance term measures how sensitive the predictor $f(x;D)$ is to + fluctuations in the training data: for different draws of $D$, predictions + at $x$ may vary significantly around their mean $\bar{f}(x)$. + \item The bias term measures how far, on average, the prediction is from + the true label $y$. Even if we could train on infinitely many different + datasets, the average predictor $\bar{f}(x)$ might still systematically + miss the true target due to limited model capacity or misspecification. +\end{itemize} + +High variance typically arises from \emph{overfitting} particular training sets, +while high bias indicates that the model class itself is too simple to capture +the underlying relationship, even when averaged over many datasets. + +\begin{example}[Linear model on nonlinear data] + Suppose the data arise from a quadratic relationship, but we insist on + fitting a linear model. Even if we repeatedly resample training sets and + average the resulting fitted lines, the averaged predictor $\bar{f}(x)$ + remains linear and cannot match the true quadratic curve everywhere. This + manifests as a persistent bias term. 
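+  As an illustrative (hypothetical) instance: if $x$ is uniform on $[-1,1]$
+  and $y = x^2$ with no noise, the infinite-data least-squares line is the
+  constant $\bar f(x) = \mathbb{E}[x^2] = \tfrac13$, so the pointwise squared
+  bias $(x^2 - \tfrac13)^2$ remains strictly positive for almost every $x$,
+  no matter how much data we collect.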
+\end{example} + +Bias and variance suggest complementary strategies: +\begin{itemize} + \item To reduce \textbf{variance}, we can increase the training set size or + average multiple diverse models (as in bagging and random forests). + \item To reduce \textbf{bias}, we can increase model capacity, e.g.\ by + using more flexible hypothesis classes or boosting weak learners. +\end{itemize} + +Two classical ensemble strategies are: +\begin{itemize} + \item \emph{Bagging} (Bootstrap Aggregating): build many base learners + independently on randomized versions of the data, and average or vote. + \item \emph{Boosting}: build base learners sequentially, each one focusing + on correcting the mistakes of the previous ones. +\end{itemize} + +In modern machine learning practice, decision-tree ensembles based on bagging +and boosting are among the most powerful off-the-shelf methods, with famous +examples including Random Forests, Gradient Boosting Trees, and XGBoost. + +\subsection{Bagging: Bootstrap Aggregating} + +Bagging is designed to reduce the variance of an unstable base learner (such as +a deep decision tree) by averaging many independently trained copies. + +\begin{definition}[Bagging] + Let $D$ be a training dataset of size $n$, and let $\mathcal{A}$ be a base + learning algorithm that outputs a predictor $f = \mathcal{A}(D)$. + Bagging constructs $M$ bootstrap datasets $D^{(1)},\dots,D^{(M)}$, where + each $D^{(m)}$ is obtained by sampling $n$ points \emph{with replacement} + from $D$. + + On each bootstrap dataset $D^{(m)}$, we train a base learner + \begin{equation} + f_m = \mathcal{A}\bigl(D^{(m)}\bigr), + \qquad m=1,\dots,M. + \end{equation} + The bagged ensemble predictor is + \begin{equation} + F_{\mathrm{bag}}(x) + = + \begin{cases} + \mathrm{sign}\Bigl(\dfrac{1}{M}\sum_{m=1}^M f_m(x)\Bigr), + & \text{classification},\\[6pt] + \dfrac{1}{M}\sum_{m=1}^M f_m(x), + & \text{regression}. + \end{cases} + \end{equation} +\end{definition} + +Intuitively, each bootstrap sample $D^{(m)}$ can be viewed as a noisy version +of the original dataset. The base learners $f_m$ will differ from one another, +and averaging their predictions cancels out part of the randomness. + +\begin{remark}[Variance Reduction] + Consider a simplified setting where we average $M$ identically distributed + base predictors $f_1,\dots,f_M$ for regression, each with variance + $\sigma^2$ and pairwise correlation $\rho$. Then the variance of the + averaged predictor is + \begin{equation} + \mathrm{Var}\bigl(F_{\mathrm{bag}}(x)\bigr) + = \frac{1}{M^2}\sum_{m=1}^M\sum_{m'=1}^M + \mathrm{Cov}\bigl(f_m(x), f_{m'}(x)\bigr) + \approx \sigma^2\left(\rho + \frac{1-\rho}{M}\right). + \end{equation} + As $M$ grows, the second term $(1-\rho)/M$ vanishes, so the variance is + dominated by $\rho\sigma^2$. Thus, bagging is most effective when base + learners are accurate but have high variance and are not too strongly + correlated. +\end{remark} + +\begin{figure}[h] + \centering + \includegraphics{../../tikz/8/6.pdf} +\end{figure} + +\subsection{Random Forests} + +Random Forests specialize bagging to decision trees, and further inject +randomness at the \emph{feature} level to decorrelate the trees. + +\begin{definition}[Random Forest] + A Random Forest is an ensemble of decision trees trained with two sources + of randomness: + \begin{enumerate} + \item \textbf{Bootstrap sampling of data.} + For each tree $m$, sample a bootstrap dataset $D^{(m)}$ from $D$. 
+ \item \textbf{Random feature selection at each split.} + When splitting a node, instead of considering all features, randomly + select a subset $\mathcal{F}$ of features (of size $d_{\mathrm{sub}}$), + and choose the best split only among features in $\mathcal{F}$. + \end{enumerate} + The final prediction aggregates all trees by majority vote or averaging, + as in bagging. +\end{definition} + +The additional randomness in feature selection has two important effects: +\begin{itemize} + \item It reduces the correlation between different trees, which + strengthens the variance reduction effect of averaging. + \item It forces each tree to explore different feature combinations, + sometimes discovering useful patterns that a single greedy tree might miss. +\end{itemize} + +In practice, Random Forests are strong general-purpose models with relatively +few hyperparameters. Common choices include: +\begin{itemize} + \item the number of trees $M$ (often in the hundreds), + \item the maximum depth of each tree, + \item the number of features $d_{\mathrm{sub}}$ considered at each split + (e.g., $\sqrt{d}$ for classification, where $d$ is the total number of + features). +\end{itemize} + +\begin{remark}[Out-of-Bag Evaluation] + In each bootstrap sample $D^{(m)}$, roughly a fraction $1-1/e\approx 0.63$ + of the original training points are included, and the remaining points are + left out. For any training example, we can average the predictions of all + trees that did \emph{not} see this example during training; this is called + the \emph{out-of-bag} prediction. Aggregating these predictions over the + training set provides an internal estimate of the generalization error, + without using a separate validation set. +\end{remark} + +\subsection{Boosting} + +While bagging focuses on variance reduction by averaging many independent +learners, boosting builds an ensemble \emph{sequentially}. Each new learner +tries to correct the mistakes of the current ensemble, effectively turning a +collection of weak learners into a strong one. + +\begin{definition}[Boosting (High-Level View)] + Given a training set $D = \{(x_i,y_i)\}_{i=1}^n$, boosting maintains an + ensemble predictor + \begin{equation} + F_0(x) \equiv 0,\qquad + F_M(x) = \sum_{m=1}^M \alpha_m f_m(x), + \end{equation} + where each $f_m$ is a base learner (often a shallow tree) and the weights + $\alpha_m$ control their influence. + + At iteration $m$, boosting chooses $f_m$ to focus on the current residual + errors or misclassified examples of $F_{m-1}$, and then updates the ensemble + to $F_m$. +\end{definition} + +One classical example is AdaBoost for binary classification. It maintains a +distribution of weights over training samples, gives higher weights to +misclassified points, and fits a new weak learner to this reweighted dataset at +each iteration. + +\begin{remark}[Boosting vs.\ Bagging] + \begin{itemize} + \item Bagging trains base learners \emph{in parallel} on resampled + datasets and primarily reduces variance. + \item Boosting trains base learners \emph{sequentially}, focusing on + difficult samples, and can reduce both bias and variance, but is also + more prone to overfitting if not regularized (e.g., via tree depth, + learning rate, or early stopping). + \end{itemize} + Modern implementations such as Gradient Boosting Trees and XGBoost can be + viewed as performing a form of functional gradient descent in function + space, where each tree fits the negative gradient of the loss with respect + to the current ensemble prediction. 
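+  For example, under squared loss $\ell(y,F)=\tfrac12\,(y-F)^2$, the negative
+  gradient with respect to the current prediction is just the residual
+  $y_i - F_{m-1}(x_i)$, so each new tree is fit to the residuals left by the
+  ensemble so far.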
+\end{remark} + +\begin{figure}[h] + \centering + \includegraphics{../../tikz/8/7.pdf} +\end{figure} + +\begin{note} + From a geometric perspective, both Random Forests and boosting-based tree + ensembles can be seen as constructing a complex, highly nonlinear decision + boundary by patching together many simple, axis-aligned splits. Whereas a + single tree corresponds to a small set of such partitions, an ensemble can + carve out increasingly intricate decision regions, often achieving + state-of-the-art performance on tabular data. +\end{note} + +\end{document} diff --git a/notes/2025/mvp/chapters/9-em-mog.pdf b/notes/2025/mvp/chapters/9-em-mog.pdf new file mode 100644 index 0000000..9929aa6 Binary files /dev/null and b/notes/2025/mvp/chapters/9-em-mog.pdf differ diff --git a/notes/2025/mvp/chapters/9-em-mog.tex b/notes/2025/mvp/chapters/9-em-mog.tex new file mode 100644 index 0000000..c0c8782 --- /dev/null +++ b/notes/2025/mvp/chapters/9-em-mog.tex @@ -0,0 +1,437 @@ +\documentclass[../main]{subfiles} +\begin{document} +\chapter{Expectation--Maximization and Mixture of Gaussians} +\begin{introduction} + \item Mixture of Gaussians (MoG) as a generative clustering model + \item Maximum likelihood with latent variables: why it is hard + \item KL divergence and the evidence lower bound (ELBO) + \item EM algorithm: E-step and M-step, monotonic improvement and local optima + \item Closed-form EM updates for MoG and the connection to K-means +\end{introduction} + +\section{Mixture of Gaussians (MoG): a generative view of clustering} +We revisit clustering from a probabilistic and generative perspective. +Instead of assigning each datapoint to a cluster deterministically, we assume +each datapoint is generated by \emph{first} choosing a latent cluster index and +\emph{then} sampling from a Gaussian distribution attached to that cluster. + +\subsection{Latent variable and generative process} +Let $K\in\mathbb N$ be the number of clusters (components). For each datapoint +$x\in\mathbb R^d$, introduce a latent discrete variable +\begin{equation} + G \in \{1,2,\dots,K\}, +\end{equation} +where $G=k$ indicates that $x$ is generated from the $k$-th Gaussian component. +We model +\begin{equation} + p(G=k)=\pi_k,\qquad \pi_k\ge 0,\qquad \sum_{k=1}^K \pi_k = 1, +\end{equation} +and the conditional likelihood +\begin{equation} + p(x\mid G=k)=\mathcal N(x\mid \mu_k,\Sigma_k), +\end{equation} +where $\mu_k\in\mathbb R^d$ and $\Sigma_k\in\mathbb R^{d\times d}$ is symmetric +positive definite. + +\begin{note} + Earlier chapters may encode the latent cluster by a one-hot vector + $z\in\{0,1\}^K$ with $\sum_k z_k=1$. The integer notation here is + equivalent but notationally lighter: + $G=k \Longleftrightarrow z_k=1$. +\end{note} + +\subsection{Marginal likelihood and the MLE objective} +By marginalizing $G$, the density of $x$ becomes a mixture: +\begin{equation}\label{eq:mog-marginal} + p(x;\theta) + = + \sum_{k=1}^K p(G=k)\,p(x\mid G=k) + = + \sum_{k=1}^K \pi_k\,\mathcal N(x\mid \mu_k,\Sigma_k), +\end{equation} +where we collect all parameters as +\begin{equation} + \theta := \Bigl\{\pi_k,\mu_k,\Sigma_k\Bigr\}_{k=1}^K. +\end{equation} + +Given i.i.d.\ data $\{x_i\}_{i=1}^n$, maximum likelihood estimation solves +\begin{equation}\label{eq:mog-mle} + \argmax_{\theta}\; + \sum_{i=1}^n \log p(x_i;\theta) + = + \argmax_{\theta}\; + \sum_{i=1}^n + \log\!\left( + \sum_{k=1}^K \pi_k\,\mathcal N(x_i\mid \mu_k,\Sigma_k) + \right). 
+\end{equation} + +\begin{remark} + Unlike supervised learning objectives of the form $\sum_i \log p(y_i\mid + x_i;\theta)$, unsupervised density estimation maximizes $\sum_i \log + p(x_i;\theta)$ directly because no labels are observed. +\end{remark} + +\subsection{Why direct optimization is non-trivial} +The objective \eqref{eq:mog-mle} is difficult for at least two reasons. +\begin{itemize} + \item \textbf{Log-sum structure:} $\log\big(\sum_k \pi_k \mathcal N(\cdot)\big)$ + couples all components inside a logarithm, preventing simple closed-form + derivatives from decoupling. + \item \textbf{Constraints:} $\{\pi_k\}$ must lie on the simplex and each + $\Sigma_k$ must be positive definite. One can use constrained optimization, + penalty methods, or reparameterization, but a naive unconstrained gradient + method is not directly applicable. +\end{itemize} +\begin{remark} + Gradient descent can be made to work with non-trivial adaptations (e.g., + softmax parameters for $\pi$, Cholesky factors for $\Sigma$), but EM + typically provides a faster and more elegant framework for this class of + latent-variable MLE problems. +\end{remark} + +\section{From latent-variable MLE to ELBO} +We now derive the Expectation--Maximization (EM) algorithm in a general setting. + +\subsection{General latent-variable likelihood} +Let $x$ be observed and $z$ be a latent variable (discrete or continuous). +Assume a joint model $p(x,z;\theta)$. The marginal likelihood is +\begin{equation} + p(x;\theta) = \sum_z p(x,z;\theta)\qquad + (\text{or } \int p(x,z;\theta)\,\mathrm dz). +\end{equation} +In MLE, the objective is +\begin{equation} + \argmax_{\theta}\; \sum_{i=1}^n \log p(x_i;\theta). +\end{equation} +The marginalization over $z$ often makes $\log p(x;\theta)$ hard to optimize. + +\subsection{KL divergence: definition and key properties} +\begin{definition}[Kullback--Leibler divergence] + For distributions $Q$ and $P$ on the same variable $Z$, the KL divergence + is + \begin{equation} + \mathrm{KL}(Q\|P) + := \mathbb E_{Z\sim Q}\!\left[\log\frac{Q(Z)}{P(Z)}\right]. + \end{equation} +\end{definition} + +\begin{proposition}[Basic properties of KL]\label{prop:kl-basic} + For any $Q,P$, + \begin{enumerate} + \item $\mathrm{KL}(Q\|P)\ge 0$. + \item $\mathrm{KL}(Q\|P)=0$ if and only if $Q=P$ almost surely. + \item In general, $\mathrm{KL}(Q\|P)\neq \mathrm{KL}(P\|Q)$ (not symmetric). + \end{enumerate} +\end{proposition} +\begin{proof} + By Jensen's inequality applied to the convex function $-\log(\cdot)$, + \begin{align*} + \mathrm{KL}(Q\|P) + &= -\mathbb E_{Q}\!\left[\log \frac{P(Z)}{Q(Z)}\right] + \ge -\log \mathbb E_{Q}\!\left[\frac{P(Z)}{Q(Z)}\right] + = -\log\!\left(\sum_z P(z)\right) + = 0, + \end{align*} + and equality holds iff $\frac{P(Z)}{Q(Z)}$ is constant $Q$-a.s., which gives + $Q=P$ a.s. Non-symmetry follows from counterexamples. +\end{proof} + +\begin{remark} + KL can be written as \textbf{cross-entropy minus entropy}: + \begin{equation} + \mathrm{KL}(Q\|P) + = + \underbrace{\mathbb E_Q[-\log P(Z)]}_{\text{cross-entropy}} + - + \underbrace{\mathbb E_Q[-\log Q(Z)]}_{\text{entropy}}. + \end{equation} + This explains why using the ``wrong code'' $P$ to encode samples from $Q$ + necessarily incurs extra expected description length. +\end{remark} + +\subsection{ELBO decomposition} +Introduce an arbitrary distribution $q(z)$ (a ``variational distribution''). 
+Then the following identity holds: +\begin{theorem}[ELBO decomposition]\label{thm:elbo} + For any $q(z)$ and any $\theta$, + \begin{equation}\label{eq:elbo-decomp} + \log p(x;\theta) + = + \underbrace{ + \mathbb E_{z\sim q}\!\left[\log \frac{p(x,z;\theta)}{q(z)}\right] + }_{\mathcal L(q,\theta)\;\;(\text{ELBO})} + + + \underbrace{ + \mathrm{KL}\!\left(q(z)\,\middle\|\,p(z\mid x;\theta)\right) + }_{\ge 0}. + \end{equation} + Consequently, $\mathcal L(q,\theta)\le \log p(x;\theta)$ for all $q$. +\end{theorem} +\begin{proof} + Starting from $\log p(x;\theta)$ and taking expectation over $q$, + \begin{align*} + \log p(x;\theta) + &= \sum_z q(z)\,\log p(x;\theta) \\ + &= \sum_z q(z)\,\log \frac{p(x,z;\theta)}{p(z\mid x;\theta)} \\ + &= \sum_z q(z)\,\log \frac{p(x,z;\theta)}{q(z)} + + \sum_z q(z)\,\log \frac{q(z)}{p(z\mid x;\theta)}, + \end{align*} + where the last term is exactly the KL divergence. +\end{proof} + +\begin{definition}[Evidence lower bound (ELBO)] + The functional + \begin{equation} + \mathcal L(q,\theta) + := + \mathbb E_{z\sim q}\!\left[\log p(x,z;\theta)\right] + - + \mathbb E_{z\sim q}\!\left[\log q(z)\right] + = + \mathbb E_{z\sim q}\!\left[\log \frac{p(x,z;\theta)}{q(z)}\right] + \end{equation} + is called the \emph{evidence lower bound}. +\end{definition} + +\begin{remark} + The abbreviation \textbf{ELBO} is commonly read as ``elbow'' in talks. + The bound gap is precisely + $\mathrm{KL}\big(q(z)\|p(z\mid x;\theta)\big)$. + When $q(z)=p(z\mid x;\theta)$, the bound is tight. +\end{remark} + +\begin{note} + When the posterior $p(z\mid x;\theta)$ is intractable, one typically + restricts $q$ to a tractable family and maximizes ELBO approximately (this + is the core idea of variational inference). EM can be viewed as the special + case where the E-step posterior is tractable and makes the bound tight at + the current iterate. +\end{note} + +\section{The EM algorithm} +EM is an iterative algorithm that alternates between optimizing $q$ (E-step) +and optimizing $\theta$ (M-step), using ELBO as a surrogate objective. + +\subsection{E-step and M-step from ELBO} +Suppose we are at $\theta^{(t)}$. +\begin{itemize} + \item \textbf{E-step (fix $\theta$):} + choose $q^{(t+1)}$ to maximize $\mathcal L(q,\theta^{(t)})$. + Since $\log p(x;\theta^{(t)})$ is a constant w.r.t.\ $q$, + Theorem~\ref{thm:elbo} implies + \begin{equation}\label{eq:e-step} + q^{(t+1)}(z) + = + p\bigl(z\mid x;\theta^{(t)}\bigr), + \end{equation} + which minimizes the KL gap to $0$. + + \item \textbf{M-step (fix $q$):} + update $\theta$ by maximizing the ELBO + \begin{equation}\label{eq:m-step} + \theta^{(t+1)} + \in + \argmax_{\theta}\; \mathcal L\!\left(q^{(t+1)},\theta\right). + \end{equation} + Because the term $-\mathbb E_{q^{(t+1)}}[\log q^{(t+1)}(z)]$ does not depend + on $\theta$, the M-step equivalently maximizes + \begin{equation}\label{eq:q-function} + Q(\theta\mid \theta^{(t)}) + := + \mathbb E_{z\sim p(z\mid x;\theta^{(t)})}\!\left[\log p(x,z;\theta)\right], + \end{equation} + the expected complete-data log-likelihood. +\end{itemize} + +\begin{remark} + The ``expectation'' in \textbf{E-step} refers to taking expectation with + respect to the posterior $p(z\mid x;\theta^{(t)})$, which is computed in the + E-step and then used to form the expected complete log-likelihood in the + M-step. 
+\end{remark} + +\begin{note} + The overall algorithm repeats \textbf{E-step $\rightarrow$ M-step} until + convergence, e.g.\ until $\theta^{(t+1)}$ is sufficiently close to + $\theta^{(t)}$ or the log-likelihood improvement becomes negligible. +\end{note} + +\subsection{Monotonicity and local optima} +\begin{proposition}[Monotonic improvement] + Each EM iteration does not decrease the data log-likelihood: + \[ + \log p(x;\theta^{(t+1)}) \ge \log p(x;\theta^{(t)}). + \] +\end{proposition} +\begin{proof} + In the E-step, the ELBO is made tight at $\theta^{(t)}$ by choosing + $q^{(t+1)}(z)=p(z\mid x;\theta^{(t)})$, hence + $\mathcal L(q^{(t+1)},\theta^{(t)})=\log p(x;\theta^{(t)})$. + In the M-step, we maximize the ELBO in $\theta$, so + \[ + \mathcal L(q^{(t+1)},\theta^{(t+1)}) + \ge + \mathcal L(q^{(t+1)},\theta^{(t)}) + = + \log p(x;\theta^{(t)}). + \] + Since ELBO is always a lower bound, $\log p(x;\theta^{(t+1)})\ge + \mathcal L(q^{(t+1)},\theta^{(t+1)})$, proving the claim. +\end{proof} + +\begin{remark} + EM typically converges to a \emph{stationary point}, which may be a local + optimum. Different initializations can lead to different solutions. +\end{remark} + +% \begin{figure}[h] +% \centering +% \includegraphics[width=0.9\textwidth]{../../mvp/tikz/9/5.pdf} +% \caption{Geometric intuition: each E-step picks an ELBO that ``touches'' the log-likelihood at the current $\theta^{(t)}$, and the M-step maximizes that ELBO to obtain a new parameter $\theta^{(t+1)}$ with increased likelihood.} +% \label{fig:em-elbo-geometry} +% \end{figure} + +\section{EM for Mixture of Gaussians} +We now apply the general EM framework to MoG. + +\subsection{Complete-data likelihood} +For a single datapoint $(x_i,G_i)$, +\begin{equation} + p(x_i,G_i=k;\theta) + = + p(G_i=k)\,p(x_i\mid G_i=k) + = + \pi_k\,\mathcal N(x_i\mid \mu_k,\Sigma_k). +\end{equation} +For the dataset, the complete-data log-likelihood is +\begin{equation} + \log p(\{x_i\},\{G_i\};\theta) + = + \sum_{i=1}^n \sum_{k=1}^K \mathbb I\{G_i=k\} + \Bigl( + \log \pi_k + \log \mathcal N(x_i\mid \mu_k,\Sigma_k) + \Bigr). +\end{equation} + +\subsection{E-step: responsibilities} +Define the \emph{responsibility} (posterior assignment probability) +\begin{equation}\label{eq:responsibility} + \gamma_{ik} + := + p(G_i=k \mid x_i;\theta^{(t)}). +\end{equation} +By Bayes' rule, +\begin{equation}\label{eq:mog-e-step} + \gamma_{ik} + = + \frac{ + \pi_k^{(t)}\,\mathcal N(x_i\mid \mu_k^{(t)},\Sigma_k^{(t)}) + }{ + \sum_{j=1}^K \pi_j^{(t)}\,\mathcal N(x_i\mid \mu_j^{(t)},\Sigma_j^{(t)}) + }. +\end{equation} +Intuitively, $\gamma_{ik}$ measures the fraction of ``credit'' component $k$ +takes for explaining datapoint $x_i$. + +\subsection{M-step: closed-form updates} +Given $\{\gamma_{ik}\}$, the EM objective for MoG becomes +\begin{equation}\label{eq:mog-Q} + Q(\theta\mid \theta^{(t)}) + = + \sum_{i=1}^n \sum_{k=1}^K \gamma_{ik} + \Bigl( + \log \pi_k + \log \mathcal N(x_i\mid \mu_k,\Sigma_k) + \Bigr). +\end{equation} +Let +\begin{equation} + N_k := \sum_{i=1}^n \gamma_{ik}. +\end{equation} + +\subsubsection{Update of $\pi_k$ (simplex constraint)} +We maximize \eqref{eq:mog-Q} w.r.t.\ $\pi$ subject to $\sum_k \pi_k=1$. +Introduce a Lagrange multiplier $\lambda$: +\begin{equation} + \mathcal J(\pi,\lambda) + = + \sum_{i=1}^n \sum_{k=1}^K \gamma_{ik}\log \pi_k + + \lambda\left(1-\sum_{k=1}^K \pi_k\right). +\end{equation} +Setting $\partial \mathcal J/\partial \pi_k=0$ gives +\begin{equation} + \pi_k + = + \frac{1}{n}\sum_{i=1}^n \gamma_{ik} + = + \frac{N_k}{n}. 
+\end{equation} + +\subsubsection{Update of $\mu_k$} +Using the Gaussian log-density, the terms depending on $\mu_k$ yield the +weighted least-squares problem whose solution is +\begin{equation}\label{eq:mog-mu-update} + \mu_k + = + \frac{\sum_{i=1}^n \gamma_{ik} x_i}{\sum_{i=1}^n \gamma_{ik}} + = + \frac{1}{N_k}\sum_{i=1}^n \gamma_{ik} x_i. +\end{equation} + +\subsubsection{Update of $\Sigma_k$} +Similarly, maximizing w.r.t.\ $\Sigma_k$ gives +\begin{equation}\label{eq:mog-sigma-update} + \Sigma_k + = + \frac{\sum_{i=1}^n \gamma_{ik} (x_i-\mu_k)(x_i-\mu_k)^\top}{\sum_{i=1}^n \gamma_{ik}} + = + \frac{1}{N_k}\sum_{i=1}^n \gamma_{ik} (x_i-\mu_k)(x_i-\mu_k)^\top. +\end{equation} + +\begin{remark} + The updates \eqref{eq:mog-mu-update}--\eqref{eq:mog-sigma-update} are + \textbf{soft-assignment weighted} empirical mean and covariance. If + $\gamma_{ik}\in\{0,1\}$ becomes hard assignment, they reduce to the usual + sample mean/covariance of points in cluster $k$. +\end{remark} + +\subsection{Relationship to K-means as a limiting case} +Assume an isotropic shared covariance $\Sigma_k=\sigma^2 I$ for all $k$. +Then +\begin{equation} + \gamma_{ik} + \propto + \pi_k \exp\!\left(-\frac{\|x_i-\mu_k\|^2}{2\sigma^2}\right). +\end{equation} +As $\sigma\to 0$, the softmax distribution concentrates on the closest mean: +\begin{equation} + \gamma_{ik} + \to + \begin{cases} + 1, & k=\argmin_j \|x_i-\mu_j\|^2,\\ + 0, & \text{otherwise}, + \end{cases} +\end{equation} +which recovers the hard assignment step of K-means. With these hard assignments, +the M-step mean update reduces to the K-means centroid update. + +\begin{remark} + This connection explains a common interpretation: \textbf{K-means is a + special/limiting case of MoG} (and can be seen as a degenerate EM). +\end{remark} + +\section{A clarification: PCA vs regression (from Q\&A)} +\begin{note} + PCA is an \emph{unsupervised} problem: all coordinates are treated + symmetrically and the objective is to find a low-dimensional subspace that + best explains the variance/geometry of $x$. + In contrast, regression is \emph{supervised}: $y$ plays a special role and + the objective is to minimize prediction error of $y$ given $x$. + Therefore, even if one augments $x$ with $y$, the resulting optimization is + not equivalent to PCA because the symmetry between coordinates is broken by + the learning objective. +\end{note} + +\end{document} + + diff --git a/notes/2025/mvp/chapters/9-ul.pdf b/notes/2025/mvp/chapters/9-ul.pdf new file mode 100644 index 0000000..0e5fb12 Binary files /dev/null and b/notes/2025/mvp/chapters/9-ul.pdf differ diff --git a/notes/2025/mvp/chapters/9-ul.tex b/notes/2025/mvp/chapters/9-ul.tex new file mode 100644 index 0000000..555ae42 --- /dev/null +++ b/notes/2025/mvp/chapters/9-ul.tex @@ -0,0 +1,697 @@ +\documentclass[../main]{subfiles} +\begin{document} +\chapter{Unsupervised Learning} +\begin{introduction} + \item Dimensionality reduction + \item Cluster + \item Latent-variable models and the EM algorithm +\end{introduction} +All methods we have discussed so far fall under the category of \textbf{supervised learning}, where the training process relies on labeled data and the model is explicitly guided by pairs $(x,y)$. +In contrast, \textbf{unsupervised learning} operates without labels: the goal is to learn the underlying structure of the data and to approximate the distribution of datapoints $x$ itself. + +\begin{definition}[Unsupervised Learning] + Let $\mathcal{X}$ be an unlabeled dataset. 
+	Unsupervised learning aims to learn a mapping $f:\mathcal{X}\to\mathcal{Z}$ or a generative model $p_\theta(x)$ that captures the intrinsic structure of the underlying data distribution $p(x)$.
+	Typical objectives include clustering, dimensionality reduction, density estimation, and representation learning.
+	Importantly, no labeled pairs $(x,y)$ are observed during training.
+\end{definition}
+
+Unsupervised learning can be divided into several tasks:
+\begin{enumerate}
+	\item Dimensionality reduction: datapoints often lie on a high-dimensional manifold, yet only a small number of directions carry most of the relevant structure. Dimensionality reduction trains a model to recover these crucial directions from the enormous number of raw dimensions.
+	\item Clustering: train a model to group similar datapoints together.
+	\item Generative models: models such as Sora and GPT-style LLMs are trained to generate content such as text or images by learning the distribution of the target domain.
+\end{enumerate}
+\section{Dimensionality Reduction}
+\subsection{Principal Component Analysis (PCA)}
+
+\begin{theorem}[Principle of PCA]
+	PCA seeks a direction $w$ (with $\|w\|=1$) along which the projected data $w^\top x$
+	achieves the \emph{maximum possible variance}.
+	Formally, the goal is to maximize
+	\begin{equation}
+		\operatorname{Var}(w^\top X).
+	\end{equation}
+\end{theorem}
+
+	Let the dataset be
+	\begin{equation}
+		X =
+		\begin{pmatrix}
+			x_1^{\top} \\
+			x_2^{\top} \\
+			\vdots \\
+			x_n^{\top}
+		\end{pmatrix}
+		\in \mathbb{R}^{n\times d},
+	\end{equation}
+	with sample mean
+	\begin{equation}
+		\bar{x} = \frac{1}{n}\sum_{i=1}^n x_i .
+	\end{equation}
+	Each $x_i \in \mathbb{R}^d$ is an observation.
+	\begin{figure}[h]
+		\centering
+		\includegraphics{../../tikz/9/1.pdf}
+	\end{figure}
+
+	The empirical covariance matrix is defined as
+	\begin{equation}
+		\Sigma = \frac{1}{n}\sum_{i=1}^n (x_i - \bar{x})(x_i - \bar{x})^{\top}
+		\in \mathbb{R}^{d\times d}.
+	\end{equation}
+	Entrywise,
+	\begin{equation}
+		\Sigma_{jk}
+		= \frac{1}{n}\sum_{i=1}^n
+		(x_i^{(j)} - \bar{x}^{(j)})(x_i^{(k)} - \bar{x}^{(k)}).
+	\end{equation}
+	\begin{remark}
+		The covariance matrix defined with the factor $\tfrac{1}{n}$ is technically a biased estimator of the true population covariance.
+		The unbiased version uses $\tfrac{1}{n-1}$ instead.
+		However, when $n$ is large, the difference between $\tfrac{1}{n}$ and $\tfrac{1}{n-1}$ is negligible:
+		\begin{equation}
+			\frac{1}{n} = \frac{1}{n-1} \left( 1 - \frac{1}{n} \right),
+		\end{equation}
+		and thus affects the covariance only by a vanishing scalar factor.
+
+		Moreover, PCA depends solely on the \emph{eigenvectors} of the covariance matrix, i.e., the principal directions.
+		Multiplying $\Sigma$ by any positive constant does not change its eigenvectors.
+		Therefore, for PCA, the choice between $\tfrac{1}{n}$ and $\tfrac{1}{n-1}$ has no effect on the principal components.
+	\end{remark}
+	\begin{lemma}[Variance of a Linear Projection]
+		Let $w$ be any unit vector in $\mathbb{R}^d$.
+		The projection of a data point $x_i$ onto $w$ is the scalar
+		\begin{equation}
+			z_i = w^\top x_i .
+		\end{equation}
+		The sample mean of the projected data is
+		\begin{equation}
+			\bar{z} = \frac{1}{n}\sum_{i=1}^n z_i
+			= \frac{1}{n}\sum_{i=1}^n w^\top x_i
+			= w^\top \bar{x}.
+		\end{equation}
+		Thus
+		\begin{equation}
+			z_i - \bar{z}
+			= w^\top (x_i - \bar{x}).
+		\end{equation}
+		The sample variance of the projected data is
+		\begin{equation}
+			\operatorname{Var}(w^\top X)
+			= \frac{1}{n} \sum_{i=1}^n (z_i - \bar{z})^2
+			= \frac{1}{n} \sum_{i=1}^n
+			\bigl( w^\top (x_i - \bar{x}) \bigr)^2 .
+		\end{equation}
+		Using the identity
+		\[
+			(w^\top a)^2 = w^\top (a a^\top) w,
+		\]
+		we obtain
+		\begin{equation}
+			\operatorname{Var}(w^\top X)
+			= w^\top
+			\left(
+			\frac{1}{n}\sum_{i=1}^n (x_i - \bar{x})(x_i - \bar{x})^\top
+			\right)
+			w
+			= w^\top \Sigma w.
+		\end{equation}
+	\end{lemma}
+	Let $u_1 \in \mathbb{R}^d$ be a unit vector onto which each $x_i$ is projected (a one-dimensional subspace); then PCA is equivalent to solving
+	\begin{equation}
+		\argmax_{\|u_1\|=1}\; u_1^\top \Sigma u_1 .
+	\end{equation}
+	By introducing a Lagrange multiplier $\lambda$ for the constraint $u_1^\top u_1 = 1$, we consider the Lagrangian
+	\begin{equation}
+		\mathcal{L}(u_1,\lambda)
+		= u_1^\top \Sigma u_1 - \lambda \bigl( u_1^\top u_1 - 1 \bigr).
+	\end{equation}
+	Taking the derivative with respect to $u_1$ and setting it to zero gives
+	\begin{equation}
+		\frac{\partial \mathcal{L}}{\partial u_1}
+		= 2 \Sigma u_1 - 2 \lambda u_1 = 0,
+	\end{equation}
+	hence
+	\begin{equation}
+		\Sigma u_1 = \lambda u_1.
+	\end{equation}
+	The derivative with respect to $\lambda$ enforces the constraint
+	\begin{equation}
+		\frac{\partial \mathcal{L}}{\partial \lambda}
+		= -\bigl(u_1^\top u_1 - 1\bigr) = 0
+		\quad\Longrightarrow\quad
+		u_1^\top u_1 = 1.
+	\end{equation}
+	Therefore, any maximizer \textbf{$u_1$ must be an eigenvector of $\Sigma$}, with $\lambda$ equal to the corresponding eigenvalue.
+	Since
+	\begin{equation}
+		u_1^\top \Sigma u_1 = \lambda
+	\end{equation}
+	for any unit eigenvector $u_1$, the optimization problem is solved by choosing $u_1$ as the eigenvector associated with the \textbf{largest eigenvalue of $\Sigma$}.
+	This $u_1$ is the first principal component.
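+
+\begin{note}
+	As an illustrative sanity check (a toy example, not tied to any dataset above), take $d=2$ and suppose the empirical covariance is
+	\begin{equation*}
+		\Sigma =
+		\begin{pmatrix}
+			2 & 1 \\
+			1 & 2
+		\end{pmatrix}.
+	\end{equation*}
+	Its eigenvalues are $\lambda_1 = 3$ and $\lambda_2 = 1$, with unit eigenvectors $u_1 = \tfrac{1}{\sqrt{2}}(1,1)^\top$ and $u_2 = \tfrac{1}{\sqrt{2}}(1,-1)^\top$.
+	Hence the first principal component points along the diagonal direction $(1,1)$, and the projected variance along it, $u_1^\top \Sigma u_1 = 3$, is the largest achievable by any unit vector.
+\end{note}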
+
+	After obtaining the first principal component $u_1$, the second principal component $u_2$ is defined as the direction that maximizes the projected variance subject to two constraints:
+\begin{equation}
+	\max_{u_2}\; u_2^\top \Sigma u_2
+\end{equation}
+subject to
+\begin{equation}
+	u_2^\top u_2 = 1,
+	\qquad
+	u_2^\top u_1 = 0.
+\end{equation}
+The first condition enforces unit length; the second ensures orthogonality to $u_1$.
+
+Introduce Lagrange multipliers $\lambda_2$ and $\alpha$.
+Consider the Lagrangian
+\begin{equation}
+\mathcal{L}(u_2,\lambda_2,\alpha)
+	= u_2^\top \Sigma u_2
+	+ \lambda_2 (1 - u_2^\top u_2)
+	+ \alpha\, u_2^\top u_1.
+\end{equation}
+Taking the derivative with respect to $u_2$ and setting it to zero yields
+\begin{equation}
+	\frac{\partial \mathcal{L}}{\partial u_2}
+	= 2 \Sigma u_2 - 2 \lambda_2 u_2 + \alpha u_1 = 0.
+\end{equation}
+
+Rearranging the stationarity condition gives
+\begin{equation}
+	\Sigma u_2 = \lambda_2 u_2 - \frac{\alpha}{2} u_1.
+\end{equation}
+Using the orthogonality constraint $u_1^\top u_2 = 0$ and the fact that
+\begin{equation}
+	\Sigma u_1 = \lambda_1 u_1,
+\end{equation}
+we obtain
+\begin{equation}
+	u_1^\top \Sigma u_2 = \lambda_2\, u_1^\top u_2 - \frac{\alpha}{2} u_1^\top u_1
+	= -\frac{\alpha}{2}.
+\end{equation}
+On the other hand,
+\begin{equation}
+	u_1^\top \Sigma u_2 = (\Sigma u_1)^\top u_2 = \lambda_1 u_1^\top u_2 = 0.
+\end{equation}
+Thus $\alpha = 0$, and the stationarity condition reduces to the eigenvalue equation
+\begin{equation}
+	\Sigma u_2 = \lambda_2 u_2.
+\end{equation}
+Therefore $u_2$ must be an eigenvector of $\Sigma$ associated with the second largest eigenvalue.
+
+By induction:
+\begin{theorem}
+	The first $K$ principal components are exactly the eigenvectors of the covariance matrix $\Sigma$ corresponding to its $K$ largest eigenvalues.
+\end{theorem}
+Denote these eigenvectors by
+\begin{equation}
+	U_{1:K}
+	=
+	( u_1, u_2, \cdots, u_K )
+	\in \mathbb{R}^{d \times K}.
+\end{equation}
+Each $u_i$ satisfies $\Sigma u_i = \lambda_i u_i$, with $\lambda_1 \ge \lambda_2 \ge \cdots \ge \lambda_K$.
+
+	Given the matrix of the top $K$ principal directions $U_{1:K}$, the projection of a data point $x \in \mathbb{R}^d$ onto the $K$-dimensional PCA subspace is $z = U_{1:K}^{\top} x \in \mathbb{R}^{K}$.
+	For a dataset $X \in \mathbb{R}^{n \times d}$, the projected data matrix is
+	\begin{equation}
+		X U_{1:K}
+		\in \mathbb{R}^{n \times K}.
+	\end{equation}
+	This reduces the dimensionality from $d$ to $K$ while retaining the directions of maximal variance.
+
+	\begin{definition}[Centered Data Matrix]
+		Let the centered data matrix be
+		\begin{equation}
+			\hat{X}
+			=
+			\begin{pmatrix}
+				x_1^{\top} - \bar{x}^{\top} \\
+				x_2^{\top} - \bar{x}^{\top} \\
+				\vdots \\
+				x_n^{\top} - \bar{x}^{\top}
+			\end{pmatrix}
+			\in \mathbb{R}^{n \times d}.
+		\end{equation}
+		Each row of $\hat{X}$ is a mean-subtracted data point.
+	\end{definition}
+
+	The empirical covariance matrix can be written compactly as
+	\begin{equation}
+		\Sigma = \frac{1}{n}\, \hat{X}^{\top}\hat{X}.
+	\end{equation}
+
+	\begin{theorem}[Eigen-Decomposition of the Covariance]
+		Because $\Sigma$ is real symmetric, it admits an orthogonal eigendecomposition:
+		\begin{equation}
+			\Sigma
+			= U \Lambda U^{\top},
+		\end{equation}
+		where $U$ is an orthogonal matrix whose columns are eigenvectors
+		and $\Lambda$ is a diagonal matrix containing the eigenvalues of $\Sigma$.
+	\end{theorem}
+
+	In practice, numerical libraries such as \texttt{numpy.linalg.eig} or
+	\texttt{numpy.linalg.eigh} compute the eigen-decomposition of $\Sigma$ directly.
+	The eigenvectors $\{u_1,\ldots,u_d\}$ form the principal directions of the data.
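+
+	The derivation above translates directly into a few lines of NumPy. The following minimal sketch (the function name and interface are illustrative, not taken from any library) centers the data, forms the covariance with the $\tfrac{1}{n}$ factor used above, and keeps the eigenvectors associated with the $K$ largest eigenvalues:
+\begin{verbatim}
+import numpy as np
+
+def pca_top_k(X, K):
+    """Top-K principal directions of X (n x d) and the centered projections."""
+    x_bar = X.mean(axis=0)                    # sample mean
+    Xc = X - x_bar                            # centered data matrix
+    Sigma = Xc.T @ Xc / X.shape[0]            # empirical covariance (1/n factor)
+    eigvals, eigvecs = np.linalg.eigh(Sigma)  # eigenvalues in ascending order
+    idx = np.argsort(eigvals)[::-1][:K]       # indices of the K largest eigenvalues
+    U = eigvecs[:, idx]                       # d x K matrix of principal directions
+    return Xc @ U, U                          # n x K projections and the directions
+\end{verbatim}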
+\subsection{Singular Value Decomposition (SVD)}
+	Consider the singular value decomposition of the centered data matrix:
+	\begin{equation}
+		\hat{X} = U_X\, S\, V^{\top}.
+	\end{equation}
+	\newcommand{\mblock}[2]{%
+		\begingroup
+		\setlength{\fboxsep}{1pt}%
+		\colorbox{#1!20}{$\displaystyle #2$}%
+		\endgroup
+}
+
+\begin{equation}
+	\hat{X}_{\mathbb{R}^{n\times d}}
+	=
+	\mblock{blue}{ U_{X\,\mathbb{R}^{n\times r}}}\,
+	\mblock{green}{\begin{pmatrix}
+				\sigma_1&\cdots &0&0&\cdots&0\\
+				0&\sigma_2&\cdots&0&\cdots&0\\
+				\vdots&\ddots &&&&\vdots\\
+				0&\cdots&0&\sigma_r&\cdots&0
+	\end{pmatrix} _{	\mathbb{R}^{r\times r}}}\,
+	\mblock{red}{V^{\top}_	{\mathbb{R}^{r\times d}} }.
+\end{equation}
+	Then
+	\begin{equation}
+		\hat{X}^{\top}\hat{X}
+		= V S^{2} V^{\top},
+	\end{equation}
+	which implies that
+	\begin{equation}
+		\Sigma = \frac{1}{n} V S^{2} V^{\top}.
+	\end{equation}
+	Thus \textbf{the right singular vectors of $\hat{X}$ are exactly the eigenvectors of $\Sigma$},
+	and the eigenvalues of $\Sigma$ are the squared singular values of $\hat{X}$ scaled by $\tfrac{1}{n}$.
+
+
+	\begin{remark}
+		This relationship provides an alternative way to compute principal components:
+		the top $K$ principal directions are simply the first $K$ right singular vectors of $\hat{X}$.
+		Working with the SVD of $\hat{X}$ is also numerically more stable than explicitly forming $\Sigma=\tfrac{1}{n}\hat{X}^{\top}\hat{X}$, since forming the product squares the condition number; the advantage is most noticeable when $d$ is large.
+ \end{remark} + \begin{note} + In practice, computing a full SVD of a \(d\times d\) matrix costs \(O(d^3)\) time, + which is prohibitive for high-dimensional data. + Modern numerical linear algebra therefore relies on \emph{approximate} SVD methods + that dramatically reduce the computational burden while preserving the leading singular components. + + A common approach is the \emph{truncated SVD}: + instead of computing all singular values, we only approximate the top \(K\) components. + Techniques such as randomized sketching, subspace iteration, and the Nyström method + project the data onto a low-dimensional subspace in which SVD becomes cheap. + These methods achieve a cost of roughly + \begin{equation*} + O(ndK) \quad\text{or}\quad O(d^2K), + \end{equation*} + depending on the algorithm, reducing the complexity significantly when \(K \ll d\). + + This is why real-world PCA implementations (e.g., \emph{\texttt{sklearn}, \texttt{numpy.linalg.svd}}, + and randomized PCA algorithms) compute only the leading singular vectors rather than performing a full decomposition. + \end{note} +\subsection{t-distributed Stochastic Neighbor Embedding (t-SNE)*} + +t-SNE is a nonlinear dimensionality reduction method designed for visualization +of high-dimensional datasets. +It constructs two probability distributions: +\begin{itemize} + \item one on the pairwise similarities in the high-dimensional space, + \item one on the pairwise similarities in the low-dimensional embedding, +\end{itemize} +and finds an embedding that makes these two distributions as close as possible. + + +\begin{definition}[High-Dimensional Similarity] +For datapoints $x_i, x_j$ in the original space, define a conditional probability +\begin{equation} + p_{j|i} + = \frac{\exp\!\left(-\|x_i - x_j\|^2 / 2\sigma_i^2\right)} + {\sum_{k \neq i} \exp\!\left(-\|x_i - x_k\|^2 / 2\sigma_i^2\right)}. +\end{equation} +The bandwidth $\sigma_i$ is chosen such that the \emph{perplexity} +of the distribution matches a user-specified value. +The symmetric joint probability is +\begin{equation} + p_{ij} = \frac{p_{j|i} + p_{i|j}}{2n}. +\end{equation} +\end{definition} +\begin{note} + It simply applies an RBF kernel to these data points. +\end{note} +\begin{definition}[Low-Dimensional Similarity] +For embedding points $y_i, y_j \in \mathbb{R}^2$, +t-SNE uses a Student-\(t\) distribution with one degree of freedom: +\begin{equation} + q_{ij} + = + \frac{\left(1 + \|y_i - y_j\|^2\right)^{-1}} + {\sum_{k \neq \ell} + \left(1 + \|y_k - y_\ell\|^2\right)^{-1}}. +\end{equation} +The heavy-tailed distribution alleviates the “crowding problem.” +\end{definition} + +\begin{theorem}[t-SNE Objective] +t-SNE seeks an embedding that minimizes the Kullback–Leibler divergence +between the high-dimensional and low-dimensional similarity distributions: +\begin{equation} + \mathcal{L}(Y) + = \mathrm{KL}(P \parallel Q) + = \sum_{i \neq j} p_{ij} \log \frac{p_{ij}}{q_{ij}}. +\end{equation} +Gradient descent on $\mathcal{L}$ yields the final visualization. +\end{theorem} + +t-SNE preserves local neighborhood structure. +Pairs with large $p_{ij}$ are forced to satisfy large $q_{ij}$, keeping +nearby points close. +The heavy-tailed $t$-distribution allows distant points to be modeled far apart, +preventing clusters from collapsing together. + +t-SNE is excellent for visualization but not for general-purpose embedding: +its geometry is not globally meaningful, and repeated runs +may differ due to initialization and stochasticity. 
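+
+In practice one rarely implements t-SNE by hand; a minimal sketch of a typical call through \texttt{scikit-learn} (assuming it is installed; the data array below is just a random placeholder) looks as follows, with the perplexity and the initialization being the main knobs to tune:
+\begin{verbatim}
+import numpy as np
+from sklearn.manifold import TSNE
+
+X = np.random.rand(200, 50)     # placeholder data: n = 200 points in d = 50 dims
+tsne = TSNE(n_components=2, perplexity=30.0, init="pca", random_state=0)
+Y = tsne.fit_transform(X)       # (200, 2) embedding used only for visualization
+\end{verbatim}
+Because of the stochastic optimization, different \texttt{random\_state} values can give visibly different layouts, which is exactly the caveat mentioned above.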
+\begin{figure}[h] + \centering + \includegraphics{../../tikz/9/2.pdf} +\end{figure} +\section{Clustering} +\subsection{K-means Clustering Algorithm} +Clustering is a fundamental task in unsupervised learning. +Given a dataset +$ + D = \{x_1, x_2, \ldots, x_n\} \subset \mathbb{R}^d, +$ +our goal is to divide these points into \(K\) meaningful groups. +Here \(K\) is a user-chosen hyperparameter, reflecting how many clusters we expect to find. + +To formalize this idea, assume that each cluster \(k \in [K]\) is represented by a \textbf{center} +$ + m_k \in \mathbb{R}^d, +$ +and each data point chooses exactly one cluster. +We denote the assignment using an indicator variable \(r_{ik} \in \{0,1\}\): +\begin{equation} + r_{ik} = 1 + \quad\Longleftrightarrow\quad + x_i \text{ belongs to cluster } k. +\end{equation} +Because every point must join one and only one cluster, we enforce +\begin{equation} + \sum_{k=1}^{K} r_{ik} = 1, + \qquad \forall i \in [n]. +\end{equation} + +The intuitive goal of K-means is simple: +each point should be close to the center of the cluster it joins. +This leads to the objective function +\begin{theorem} + \begin{equation} + L = + \sum_{i=1}^{n} + \sum_{k=1}^{K} + r_{ik} \, \|x_i - m_k\|^2, + \end{equation} +\end{theorem} +which measures the total within-cluster variance. + +At first glance, minimizing \(L\) over both the assignments \(r\) and the centers \(m\) seems challenging, +because the problem is non-convex and the discrete variables \(r_{ik}\) make the search space combinatorial. +However, K-means becomes surprisingly tractable once we notice a key structural property of its objective. +Although optimizing both the assignments \(r\) and the centers \(m\) jointly is difficult, +each subproblem becomes very simple when the other variable is held fixed: + +\begin{itemize} + \item \textbf{Fixing the centers \(\{m_k\}\):} + assigning each point \(x_i\) reduces to choosing the closest center. + + \item \textbf{Fixing the assignments \(\{r_{ik}\}\):} + updating each center becomes a simple averaging step, since the optimal \(m_k\) is just the mean of the points assigned to cluster \(k\). +\end{itemize} + +This observation directly motivates the alternating minimization strategy used by the K-means algorithm: +we first update the assignments, then update the centers, and repeat the process until convergence. +The objective value decreases at every iteration, and although the algorithm is not guaranteed to reach the global optimum, +it converges to a stable \textbf{local minimum} in practice. + +\textbf{Assignment step (fix the centers).} +For each datapoint \(x_i\), we assign it to the nearest cluster center: +\begin{equation} + r_{ik} = + \begin{cases} + 1, + & k = \displaystyle\argmin_{j\in[K]} \|x_i - m_j\|^2, \\[6pt] + 0, + & \text{otherwise}. + \end{cases} +\end{equation} + +\textbf{Update step (fix the assignments).} +To update \(m_k\), we minimize the partial objective +\begin{equation} + \sum_{i=1}^{n} r_{ik} \|x_i - m_k\|^2 +\end{equation} +with respect to \(m_k\). +Taking the derivative and setting it to zero, +\begin{equation} + \frac{\partial}{\partial m_k} + \sum_{i=1}^{n} r_{ik} \|x_i - m_k\|^2 + = + -2 \sum_{i=1}^{n} r_{ik} (x_i - m_k) + = 0, +\end{equation} +which gives the closed-form mean update: +\begin{equation} + m_k + = + \frac{\sum_{i=1}^{n} r_{ik} x_i} + {\sum_{i=1}^{n} r_{ik}}. 
+\end{equation} + +\begin{remark} + At each iteration, K-means performs a greedy improvement step: + the assignment update minimizes the objective given the centers, + and the center update minimizes the objective given the assignments. + Hence the objective value is non-increasing and bounded below, + which guarantees convergence. +\end{remark} + +\begin{remark} + However, precisely because each step is greedy and only optimizes a partial variable, + K-means may converge to a \emph{local} minimum rather than the global optimum. + Different initializations can therefore lead to different final solutions. +\end{remark} + +Together, these two steps define the classical K-means iteration, +which alternates between assigning points to their closest centers +and recomputing each center as the average of its assigned datapoints. +\begin{figure}[h] + \centering + \includegraphics[width=\textwidth]{../../tikz/9/3.pdf} +\end{figure} +\begin{note} +At each iteration, the K-means algorithm computes the \emph{centroids} of the current approximate clusters and then uses these centroids as the new cluster centers. + In other words, K-means repeatedly replaces each center with the center of mass of the points currently assigned to it. +\end{note} +\subsection{Mixture of Gaussians (MoG)} +K-means assumes that points belonging to each cluster are concentrated around a center, +but it does not model the \emph{shape} or \emph{spread} of each cluster. +A more expressive model is the \emph{Mixture of Gaussians}, +which assumes that points assigned to the same cluster are generated from a Gaussian distribution with its own mean and covariance. + +Formally, for cluster \(k \in [K]\), assume +$ + (x \mid z_k = 1) \sim \mathcal{N}(\mu_k,\, \Sigma_k). +$ +Here \(z = (z_1,\ldots,z_K)\) is a latent indicator vector with +\begin{equation} + z_k \in \{0,1\}, + \qquad + \sum_{k=1}^{K} z_k = 1. +\end{equation} +Thus \(z_k = 1\) indicates that \(x\) is generated from cluster \(k\). + +The prior distribution over latent clusters is modeled using a categorical distribution: +\begin{equation} + P(z_k = 1) = \pi_k, + \qquad + 0 \le \pi_k \le 1, + \qquad + \sum_{k=1}^{K} \pi_k = 1. +\end{equation} + +Given the latent variable \(z\), the conditional likelihood is +\begin{equation} + P(x \mid z_k = 1) + = + \mathcal{N}(x \mid \mu_k, \Sigma_k). +\end{equation} + +Since MoG is a \emph{generative model}, the marginal density of \(x\) is obtained by summing over all latent clusters: +\begin{equation} + P(x) + = + \sum_{k=1}^{K} + P(z_k = 1)\, P(x \mid z_k = 1) + = + \sum_{k=1}^{K} + \pi_k \, \mathcal{N}(x \mid \mu_k, \Sigma_k). +\end{equation} + +% In practice, the parameters +% \(\{\pi_k, \mu_k, \Sigma_k\}_{k=1}^{K}\) +% are estimated by maximizing the likelihood of the observed data, +% typically using the EM algorithm (Expectation–Maximization), +% which alternates between inferring the latent cluster probabilities and updating the Gaussian parameters. +\begin{figure}[h] + \centering + \includegraphics{../../tikz/9/4.pdf} +\end{figure} +\begin{note} + MoG can be viewed as a soft, probabilistic extension of K-means: +instead of forcing each point into exactly one cluster, +it allows fractional responsibilities and models not only the center but also the spread and orientation of each cluster. +\end{note} +\section{Latent Variable Models and the EM Algorithm} +\subsection{Latent-variable MLE and why it is hard} +MoG is a \emph{latent-variable} model: for each datapoint $x_i$ there is an +unobserved cluster indicator. 
We will use the lighter integer notation +\begin{equation} + G_i \in \{1,2,\dots,K\}, +\end{equation} +where $G_i=k$ means $x_i$ is generated from component $k$. +(This is equivalent to one-hot encoding used elsewhere.) + +The marginal likelihood for each datapoint is +\begin{equation} + p(x_i;\theta)=\sum_{k=1}^K \pi_k\,\mathcal N(x_i\mid \mu_k,\Sigma_k), +\end{equation} +thus the MLE objective becomes +\begin{equation}\label{eq:ul-mog-mle} + \argmax_{\theta}\; + \sum_{i=1}^{n} + \log\!\left( + \sum_{k=1}^{K} + \pi_k\, \mathcal{N}(x_i \mid \mu_k, \Sigma_k) + \right), +\end{equation} +where $\theta=\{\pi_k,\mu_k,\Sigma_k\}_{k=1}^K$. + +\begin{remark} + The difficulty is the \textbf{log of a sum}, which couples all components. + Moreover, $\sum_k\pi_k=1$ and each $\Sigma_k$ must be positive definite, + making naive unconstrained gradient descent non-trivial. +\end{remark} + +\subsection{KL divergence and ELBO (evidence lower bound)} +To derive EM in a general and reusable way, we introduce KL divergence. +\begin{definition}[Kullback--Leibler divergence] + For distributions $Q,P$ on the same variable $Z$, + \begin{equation} + \mathrm{KL}(Q\|P) + := \mathbb E_{Z\sim Q}\!\left[\log\frac{Q(Z)}{P(Z)}\right]. + \end{equation} +\end{definition} + +\begin{proposition}\label{prop:ul-kl} + $\mathrm{KL}(Q\|P)\ge 0$, and $\mathrm{KL}(Q\|P)=0$ iff $Q=P$ (a.s.). +\end{proposition} + +\begin{theorem}[ELBO decomposition]\label{thm:ul-elbo} + For any $q(z)$ and any $\theta$, + \begin{equation}\label{eq:ul-elbo} + \log p(x;\theta) + = + \underbrace{ + \mathbb E_{z\sim q}\!\left[\log \frac{p(x,z;\theta)}{q(z)}\right] + }_{\mathcal L(q,\theta)\;\;(\text{ELBO})} + + + \underbrace{ + \mathrm{KL}\!\left(q(z)\,\middle\|\,p(z\mid x;\theta)\right) + }_{\ge 0}. + \end{equation} + Hence $\mathcal L(q,\theta)\le \log p(x;\theta)$. +\end{theorem} + +\begin{remark} + ELBO is commonly read as ``elbow'' in talks. + The bound is tight when $q(z)=p(z\mid x;\theta)$. +\end{remark} + +\subsection{The EM algorithm (Expectation--Maximization)} +EM alternates between optimizing the variational distribution $q$ and the model +parameters $\theta$. + +\textbf{E-step (fix $\theta$).} +Given $\theta^{(t)}$, maximize ELBO in $q$: +\begin{equation} + q^{(t+1)}(z)=p(z\mid x;\theta^{(t)}), +\end{equation} +which drives the KL gap in \eqref{eq:ul-elbo} to $0$ at $\theta^{(t)}$. + +\textbf{M-step (fix $q$).} +With $q^{(t+1)}$ fixed, update parameters by maximizing ELBO: +\begin{equation} + \theta^{(t+1)}\in\argmax_{\theta}\; \mathcal L\!\left(q^{(t+1)},\theta\right). +\end{equation} +Because the $-\mathbb E_{q^{(t+1)}}[\log q^{(t+1)}]$ term does not depend on +$\theta$, this is equivalent to maximizing the expected complete-data +log-likelihood +\begin{equation} + Q(\theta\mid \theta^{(t)}) + := + \mathbb E_{z\sim p(z\mid x;\theta^{(t)})}\!\left[\log p(x,z;\theta)\right]. +\end{equation} + +\begin{remark} + Repeating E-step $\rightarrow$ M-step yields a monotone (non-decreasing) + data log-likelihood sequence, and EM converges to a stationary point (not + necessarily the global optimum). +\end{remark} + +\subsection{EM for MoG: responsibilities and closed-form updates} +For MoG, the complete-data likelihood for one datapoint is +\begin{equation} + p(x_i,G_i=k;\theta) + = \pi_k\,\mathcal N(x_i\mid \mu_k,\Sigma_k). 
+\end{equation} + +\textbf{E-step.} Compute posterior responsibilities +\begin{equation} + \gamma_{ik} + := + p(G_i=k\mid x_i;\theta^{(t)}) + = + \frac{\pi_k^{(t)}\, \mathcal{N}(x_i \mid \mu_k^{(t)}, \Sigma_k^{(t)})} + {\sum_{j=1}^{K} \pi_j^{(t)}\, \mathcal{N}(x_i \mid \mu_j^{(t)}, \Sigma_j^{(t)})}. +\end{equation} + +\textbf{M-step.} Let $N_k:=\sum_{i=1}^n\gamma_{ik}$. Then the maximizer has +closed-form updates: +\begin{gather} + \pi_k=\frac{N_k}{n},\\ + \mu_k=\frac{1}{N_k}\sum_{i=1}^n \gamma_{ik}x_i,\\ + \Sigma_k=\frac{1}{N_k}\sum_{i=1}^n \gamma_{ik}(x_i-\mu_k)(x_i-\mu_k)^{\top}. +\end{gather} + +\begin{remark} + These are \textbf{soft-assignment weighted} empirical mean and covariance. + When responsibilities collapse to hard assignments, MoG-EM reduces to the + familiar K-means updates. +\end{remark} +\end{document} \ No newline at end of file diff --git a/notes/2025/mvp/main.pdf b/notes/2025/mvp/main.pdf index 8cf0e9d..6e25b79 100644 Binary files a/notes/2025/mvp/main.pdf and b/notes/2025/mvp/main.pdf differ diff --git a/notes/2025/mvp/main.tex b/notes/2025/mvp/main.tex index ee48f36..b2e7a73 100644 --- a/notes/2025/mvp/main.tex +++ b/notes/2025/mvp/main.tex @@ -1,5 +1,8 @@ \documentclass[lang=en,newtx,10pt]{elegantbook} \usepackage{subfiles} +\usepackage{wrapfig} +\usepackage{amssymb} +\usepackage{pifont} \title{Machine Learning} \author{Shaoheng Yan (\href{https://www.photonyan.fun/about}{PhotonYan})} @@ -63,4 +66,7 @@ \subfile{chapters/5-rt.tex} \subfile{chapters/6-lt.tex} \subfile{chapters/7-gp.tex} +\subfile{chapters/8-tel.tex} +\subfile{chapters/9-ul.tex} +% \subfile{chapters/9-em-mog.tex} \end{document} \ No newline at end of file diff --git a/notes/2025/mvp/tikz/9/5.pdf b/notes/2025/mvp/tikz/9/5.pdf new file mode 100644 index 0000000..ddd593d Binary files /dev/null and b/notes/2025/mvp/tikz/9/5.pdf differ diff --git a/notes/2025/mvp/tikz/9/5.tex b/notes/2025/mvp/tikz/9/5.tex new file mode 100644 index 0000000..7f7dd47 --- /dev/null +++ b/notes/2025/mvp/tikz/9/5.tex @@ -0,0 +1,48 @@ +\documentclass[tikz,border=5pt]{standalone} +\usepackage{amsmath} +\usetikzlibrary{arrows.meta} + +\begin{document} +\begin{tikzpicture}[>=Latex,scale=1] + +% axes +\draw[->] (0,0) -- (6.5,0) node[below] {$\theta$}; +\draw[->] (0,0) -- (0,4.3) node[left] {$\log p(x;\theta)$}; + +% true log-likelihood curve (white curve in lecture) +\draw[thick] plot[smooth] coordinates { + (0.4,1.1) (1.2,1.6) (2.2,2.1) (3.4,2.9) (4.6,3.1) (5.8,3.35) +}; +\node[black] at (5.3,3.65) {\small data log-likelihood}; + +% theta_old and theta_new markers +\coordinate (told) at (2.2,2.1); +\coordinate (tnew) at (4.6,3.1); +\fill (told) circle (2pt); +\fill (tnew) circle (2pt); +\node[below] at (told) {\small $\theta^{(t)}$}; +\node[below] at (tnew) {\small $\theta^{(t+1)}$}; + +% ELBO curve touching at theta^{(t)} (red curve) +\draw[thick,red] plot[smooth] coordinates { + (0.4,0.7) (1.2,1.3) (2.2,2.1) (3.4,2.55) (4.6,3.0) (5.8,3.05) +}; +\node[red] at (1.1,0.9) {\small ELBO at $\theta^{(t)}$}; + +% ELBO curve touching at theta^{(t+1)} (orange curve), indicative of next iteration +\draw[thick,orange!80!black] plot[smooth] coordinates { + (0.4,0.6) (1.2,1.0) (2.2,1.7) (3.4,2.5) (4.6,3.1) (5.8,3.25) +}; +\node[orange!80!black] at (5.0,2.55) {\small ELBO at $\theta^{(t+1)}$}; + +% arrows for E-step and M-step +\draw[->,thick] (2.2,0.2) -- (2.2,1.95); +\node[right] at (2.25,1.1) {\small E-step: tighten}; + +\draw[->,thick] (2.35,2.05) -- (4.45,3.02); +\node[above] at (3.6,2.8) {\small M-step: maximize}; + +\end{tikzpicture} 
+\end{document} + + diff --git a/notes/2025/tikz/8/1.pdf b/notes/2025/tikz/8/1.pdf new file mode 100644 index 0000000..25de9bf Binary files /dev/null and b/notes/2025/tikz/8/1.pdf differ diff --git a/notes/2025/tikz/8/1.tex b/notes/2025/tikz/8/1.tex new file mode 100644 index 0000000..1a75ed3 --- /dev/null +++ b/notes/2025/tikz/8/1.tex @@ -0,0 +1,21 @@ +\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[>=stealth, node distance=1.8cm] + % root node + \node[circle,draw,minimum width=13mm,label=right:{$w^\top x + b \ge 0$}] (root) {}; + + % leaves + \node[draw,rectangle,below left=1.4cm and 2.0cm of root] (plus) {$+1$}; + \node[draw,rectangle,below right=1.4cm and 2.0cm of root] (minus) {$-1$}; + + % edges + \draw[->] (root) -- node[above left] {yes} (plus); + \draw[->] (root) -- node[above right] {no} (minus); +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/2.bbl b/notes/2025/tikz/8/2.bbl new file mode 100644 index 0000000..e69de29 diff --git a/notes/2025/tikz/8/2.blg b/notes/2025/tikz/8/2.blg new file mode 100644 index 0000000..41db3d9 --- /dev/null +++ b/notes/2025/tikz/8/2.blg @@ -0,0 +1,3 @@ +[0] Config.pm:308> INFO - This is Biber 2.20 +[0] Config.pm:311> INFO - Logfile is '2.blg' +[42] biber:340> INFO - === Wed Nov 19, 2025, 15:59:11 diff --git a/notes/2025/tikz/8/2.pdf b/notes/2025/tikz/8/2.pdf new file mode 100644 index 0000000..d8230ce Binary files /dev/null and b/notes/2025/tikz/8/2.pdf differ diff --git a/notes/2025/tikz/8/2.tex b/notes/2025/tikz/8/2.tex new file mode 100644 index 0000000..d5603b8 --- /dev/null +++ b/notes/2025/tikz/8/2.tex @@ -0,0 +1,49 @@ +\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[>=stealth, node distance=15mm] + + % root internal node (feature A) + \node[circle,draw,minimum size=11mm, + label=above:{\small $A$}] (A) {}; + + % right leaf: predict -1, set {4,5,6,7,9} + \node[rectangle,draw,below right=16mm and 24mm of A, + minimum width=12mm,minimum height=9mm] (Rminus) {$-1$}; + + % left internal node (feature B) + \node[circle,draw,below left=16mm and 10mm of A, + minimum size=11mm, + label=left:{\small $B$}] (B) {}; + + % left leaf under B: +1, set {1,2} + \node[rectangle,draw,below left=16mm and 8mm of B, + minimum width=12mm,minimum height=9mm] (Lplus) {$+1$}; + + % right leaf under B: -1, set {3,8} + \node[rectangle,draw,below right=16mm and 8mm of B, + minimum width=12mm,minimum height=9mm] (Lminus) {$-1$}; + + % edges from root + \draw[->] (A) -- node[above left,pos=0.45] {\scriptsize yes} + node[left,pos=0.55] {\scriptsize $\{1,2,3,8\}$} + (B); + \draw[->] (A) -- node[above right,pos=0.45] {\scriptsize no} + node[right,pos=0.55] {\scriptsize $\{4,5,6,7,9\}$} + (Rminus); + + % edges from B + \draw[->] (B) -- node[above left,pos=0.45] {\scriptsize yes} + node[left,pos=0.65] {\scriptsize $\{1,2\}$} + (Lplus); + \draw[->] (B) -- node[above right,pos=0.45] {\scriptsize no} + node[right,pos=0.65] {\scriptsize $\{3,8\}$} + (Lminus); + +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/3.pdf b/notes/2025/tikz/8/3.pdf new file mode 100644 index 0000000..da92626 Binary files /dev/null and b/notes/2025/tikz/8/3.pdf differ diff --git a/notes/2025/tikz/8/3.tex b/notes/2025/tikz/8/3.tex new file mode 100644 index 0000000..b20f5f4 --- /dev/null +++ b/notes/2025/tikz/8/3.tex @@ -0,0 +1,36 @@ 
+\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{calc} + +\begin{document} + +\begin{tikzpicture}[>=stealth,line width=0.5pt] + \fill[red!15] (2,2) rectangle (4,4); + % Axes + \draw[->] (-0.2,0) -- (4.2,0) node[right] {$B$}; + \draw[->] (0,-0.2) -- (0,4.2) node[above] {$A$}; + + % Horizontal and vertical split lines + \draw (0,2) -- (4,2); + \draw (2,2) -- (2,4); + + % Highlight the top-right cell (in light pink) + + + % Region labels + \node at (1,3) {$-1$}; + \node at (3,3) {$+1$}; + \node at (2,1) {$-1$}; + % \node at (3,1) {$-1$}; + + % Axis tick labels (+1 / -1) + \node[left] at (0,3) {$+1$}; + \node[left] at (0,1) {$-1$}; + + \node[below] at (1,0) {$-1$}; + \node[below] at (3,0) {$+1$}; + +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/4.pdf b/notes/2025/tikz/8/4.pdf new file mode 100644 index 0000000..bcbe9a7 Binary files /dev/null and b/notes/2025/tikz/8/4.pdf differ diff --git a/notes/2025/tikz/8/4.tex b/notes/2025/tikz/8/4.tex new file mode 100644 index 0000000..1dd715c --- /dev/null +++ b/notes/2025/tikz/8/4.tex @@ -0,0 +1,40 @@ +\documentclass[tikz]{standalone} +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[% + >=stealth, + level distance=18mm, + sibling distance=26mm, + every node/.style={font=\small} +] + +% root = 1.0 +\node[circle, draw, minimum size=9mm] (root) {$1.0$} + child[left] { + % apple leaf + node[align=center] (apple) {apple\\$0.2$} + edge from parent node[midway, left] {0} + } + child[right] { + % internal node for orange + banana + node[circle, draw, minimum size=9mm] (node2) {$0.8$} + edge from parent node[midway, right] {1} + }; + +% children of node2 +\node[align=center, below=12.7mm of node2, xshift=8mm] (orange) + {orange\\$0.3$}; + +\node[align=center, below=12mm of node2, xshift=-8mm] (banana) + {banana\\$0.5$}; + +% edges from node2 to leaves manually +\draw[->] (node2) -- (orange) node[midway,right ] {0}; +\draw[->] (node2) -- (banana) node[midway, left] {1}; + +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/5.pdf b/notes/2025/tikz/8/5.pdf new file mode 100644 index 0000000..abc5430 Binary files /dev/null and b/notes/2025/tikz/8/5.pdf differ diff --git a/notes/2025/tikz/8/5.tex b/notes/2025/tikz/8/5.tex new file mode 100644 index 0000000..177243f --- /dev/null +++ b/notes/2025/tikz/8/5.tex @@ -0,0 +1,36 @@ +\documentclass[tikz]{standalone} +\usepackage{tikz} + +\begin{document} + +\begin{tikzpicture}[every node/.style={font=\small}] + % 圆半径 + \def\r{1.8} + + % 圆心位置 + \coordinate (Xc) at (0,0); + \coordinate (Yc) at (3,0); + + % 着色:左 H(X),右 H(Y),交叠区域颜色更深 + \fill[blue!50,opacity=0.5] (Xc) circle (\r); + \fill[red!50,opacity=0.5] (Yc) circle (\r); + + % 边界 + \draw (Xc) circle (\r); + \draw (Yc) circle (\r); + + % 顶部标 H(X), H(Y) + \node[above] at (0, \r+0.2) {$H(X)$}; + \node[above] at (3, \r+0.2) {$H(Y)$}; + + % 条件熵部分:左右各一块 + \node at (-0.6,0) {$H(X\mid Y)$}; + \node at (3.6,0) {$H(Y\mid X)$}; + + % 中间交叠部分:互信息 + \draw (1.5,-0.3)--(1.5,-1.5); + \node at (1.5,-2) {$I(X;Y)$}; + +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/6.pdf b/notes/2025/tikz/8/6.pdf new file mode 100644 index 0000000..66d8fed Binary files /dev/null and b/notes/2025/tikz/8/6.pdf differ diff --git a/notes/2025/tikz/8/6.tex b/notes/2025/tikz/8/6.tex new file mode 100644 index 0000000..3e397c7 --- /dev/null +++ b/notes/2025/tikz/8/6.tex @@ -0,0 +1,51 @@ 
+\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[ + >=stealth, + every node/.style={font=\small}, + box/.style={draw,rounded corners,minimum width=18mm,minimum height=7mm,align=center}, + tree/.style={draw,rectangle,minimum width=10mm,minimum height=7mm,align=center} +] + +% Original dataset +\node[box] (data) {Training\\data $D$}; + +% Bootstrap samples +\node[box,below left=10mm and 6mm of data] (b1) {$D^{(1)}$}; +\node[box,below =10mm of data] (b2) {$D^{(2)}$}; +\node[box,below right=10mm and 6mm of data] (b3) {$D^{(M)}$}; + +% Arrows from original data to bootstraps +\draw[->] (data.south west) -- (b1.north); +\draw[->] (data.south) -- (b2.north); +\draw[->] (data.south east) -- (b3.north); + +% Base learners (trees) +\node[tree,below=10mm of b1] (t1) {$f_1$}; +\node[tree,below=10mm of b2] (t2) {$f_2$}; +\node[tree,below=10mm of b3] (t3) {$f_M$}; + +\draw[->] (b1) -- (t1); +\draw[->] (b2) -- (t2); +\draw[->] (b3) -- (t3); + +% Aggregation box +\node[box,below=16mm of t2,minimum width=26mm] (agg) {Average /\\majority vote}; + +\draw[->] (t1.south) -- ([xshift=-8mm]agg.north); +\draw[->] (t2.south) -- (agg.north); +\draw[->] (t3.south) -- ([xshift=8mm]agg.north); + +% Output +\node[right=20mm of agg] (out) {$F_{\mathrm{bag}}(x)$}; +\draw[->] (agg.east) -- (out.west); + +\end{tikzpicture} + +\end{document} + diff --git a/notes/2025/tikz/8/7.pdf b/notes/2025/tikz/8/7.pdf new file mode 100644 index 0000000..1027b33 Binary files /dev/null and b/notes/2025/tikz/8/7.pdf differ diff --git a/notes/2025/tikz/8/7.tex b/notes/2025/tikz/8/7.tex new file mode 100644 index 0000000..09ea585 --- /dev/null +++ b/notes/2025/tikz/8/7.tex @@ -0,0 +1,39 @@ +\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[ + >=stealth, + every node/.style={font=\small}, + box/.style={draw,rounded corners,minimum width=22mm,minimum height=7mm,align=center}, + tree/.style={draw,rectangle,minimum width=10mm,minimum height=7mm,align=center} +] + +% Data with weights +\node[box] (d0) {Data $D$\\weights $w^{(0)}$}; + +% First tree +\node[tree,right=15mm of d0] (f1) {$f_1$}; +\draw[->] (d0) -- node[above]{fit} (f1); + +% Reweighted data +\node[box,below=10mm of f1,minimum width=26mm] (d1) {Data $D$\\weights $w^{(1)}$}; +\draw[->] (f1.south) -- node[right]{update} (d1.north); + +% Second tree +\node[tree,right=15mm of d1] (f2) {$f_2$}; +\draw[->] (d1) -- node[below]{fit} (f2); + +% Ensemble arrows +\node[box,above=10mm of f2,minimum width=30mm] (F) {$F_2(x)=\alpha_1 f_1(x)+\alpha_2 f_2(x)$}; + +\draw[->,dashed] (f1.north east) -- (F.west); +\draw[->,dashed] (f2.north) -- (F.south); + +\end{tikzpicture} + +\end{document} + diff --git a/notes/2025/tikz/9/1.pdf b/notes/2025/tikz/9/1.pdf new file mode 100644 index 0000000..39ee108 Binary files /dev/null and b/notes/2025/tikz/9/1.pdf differ diff --git a/notes/2025/tikz/9/1.tex b/notes/2025/tikz/9/1.tex new file mode 100644 index 0000000..a89165b --- /dev/null +++ b/notes/2025/tikz/9/1.tex @@ -0,0 +1,35 @@ +\documentclass[tikz,border=5pt]{standalone} + +\begin{document} +\begin{tikzpicture}[scale=0.6,>=stealth] + + % Axes + \draw[thick,->] (-3,0) -- (3.5,0) node[below] {$x_1$}; + \draw[thick,->] (0,-3) -- (0,3.5) node[left] {$x_2$}; + + % Scatter points (rough diagonal cluster) + \foreach \x/\y in { + % upper-right cloud + 0.2/0.6, 0.4/0.9, 0.5/1.1, 0.7/1.2, 0.8/1.5, + 1.0/1.6, 1.1/1.8, 1.3/2.0, 1.4/2.2, 1.6/2.3, + 1.8/2.5, 2.0/2.6, 
2.1/2.8, 2.2/3.0, + 0.1/0.9, 0.3/1.0, 0.6/1.3, 0.9/1.7, 1.2/1.9, + % central band + -0.2/0.2, 0.0/0.4, 0.2/0.5, 0.3/0.7, 0.5/0.8, + 0.7/1.0, 0.9/1.2, 1.0/1.4, + % lower-left small cloud + -2.0/-0.7, -1.8/-0.6, -1.7/-0.4, -1.5/-0.5, -1.3/-0.3, + -1.2/-0.2, -1.0/-0.1, -0.9/0.0, -0.8/0.1 + }{ + \fill (\x,\y) circle[radius=1.6pt]; + } + + % PCA axes + % PC1: along the main diagonal direction (roughly slope ~1) + \draw[ thick,blue,->] (-3,-1.8) -- (3,3.2) node[anchor=south west] {$z_1$}; + + % PC2: perpendicular to PC1 + \draw[ thick,red,<-] (-2.5,3) -- (2.5,-3) node[anchor=north west] {$z_2$}; + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/9/2.pdf b/notes/2025/tikz/9/2.pdf new file mode 100644 index 0000000..935a0ed Binary files /dev/null and b/notes/2025/tikz/9/2.pdf differ diff --git a/notes/2025/tikz/9/2.tex b/notes/2025/tikz/9/2.tex new file mode 100644 index 0000000..af0bc6e --- /dev/null +++ b/notes/2025/tikz/9/2.tex @@ -0,0 +1,45 @@ +\documentclass[tikz,border=5pt]{standalone} + +\usepackage{xcolor} + +\begin{document} +\begin{tikzpicture}[>=stealth,scale=0.6] + + % Axes in embedding space + \draw[->,] (-3,0) -- (3.5,0) node[right] {$y_1$}; + \draw[->,] (0,-3) -- (0,3.5) node[above] {$y_2$}; +% \node[above right] at (2.2,3.2) {\small t-SNE embedding space}; + + % Cluster 1 (blue) + \foreach \x/\y in { + -1.4/1.2, -1.6/1.4, -1.2/1.5, -1.5/1.1, -1.3/1.3, + -1.7/1.2, -1.4/1.0, -1.1/1.4 + }{ + \fill[blue!70] (\x,\y) circle (1.7pt); + } + \node[blue!70] at (-1.6,1.9) {\small $\mu_A$}; + + % Cluster 2 (red) + \foreach \x/\y in { + 1.4/1.0, 1.6/1.2, 1.8/0.9, 1.5/0.7, 1.9/1.1, + 1.3/0.8, 1.7/0.6, 1.5/1.3 + }{ + \fill[red!70] (\x,\y) circle (1.7pt); + } + \node[red!70] at (2.2,1.6) {\small $\mu_B$}; + + % Cluster 3 (green) + \foreach \x/\y in { + -0.4/-1.2, -0.2/-1.0, 0.0/-1.3, 0.2/-1.1, + -0.1/-1.5, 0.3/-1.4, -0.3/-1.0, 0.1/-1.6 + }{ + \fill[green!70!black] (\x,\y) circle (1.7pt); + } + \node[green!70!black] at (0.4,-0.6) {\small $\mu_C$}; + + % A few “outliers” + \fill[gray!70] (2.4,-1.5) circle (1.7pt); + \fill[gray!70] (-2.3,-1.8) circle (1.7pt); + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/9/3.pdf b/notes/2025/tikz/9/3.pdf new file mode 100644 index 0000000..1f1ecef Binary files /dev/null and b/notes/2025/tikz/9/3.pdf differ diff --git a/notes/2025/tikz/9/3.tex b/notes/2025/tikz/9/3.tex new file mode 100644 index 0000000..5db6054 --- /dev/null +++ b/notes/2025/tikz/9/3.tex @@ -0,0 +1,88 @@ +\documentclass[tikz,border=5pt]{standalone} +\usepackage{xcolor} + +\begin{document} +\begin{tikzpicture}[scale=1] + +% ---------- common style ---------- +\tikzstyle{panel}=[draw=black!40,rounded corners,thick] +\tikzstyle{center}=[circle,draw=black,fill=white,inner sep=1pt] + +% ================== Panel 1: initial assignment ================== +\begin{scope}[shift={(0,0)}] + % clip region + \clip (-2.4,-3) rectangle (2.4,3); + + % data blobs (new color scheme) + \fill[magenta!20] (-0.4,1.3) circle (0.9); % magenta blob + \fill[gray!20] (-0.8,-1.2) ellipse (0.8 and 0.45); % gray blob + \fill[cyan!20] (1.1,-0.1) ellipse (1.5 and 1.0); % cyan blob + + % initial (bad) centers + \node[center] (c1a) at (-1.5, 2.0) {}; + \node[center] (c2a) at ( 1.8, 1.8) {}; + \node[center] (c3a) at ( 0.5, -2.0) {}; + + % dashed circular decision regions + \draw[magenta!60,dashed,thick] (c1a) circle (1.4); + \draw[cyan!60,dashed,thick] (c2a) circle (1.6); + \draw[gray!60,dashed,thick] (c3a) circle (1.4); + + % frame + title (also 
inside clip) + \draw[panel] (-2.4,-3) rectangle (2.4,3); + \node at (0,2.7) {\small Iteration 0}; +\end{scope} + +% ================== Panel 2: updated centers ================== +\begin{scope}[shift={(6,0)}] + \clip (-2.4,-3) rectangle (2.4,3); + + % same data blobs + \fill[magenta!20] (-0.4,1.3) circle (0.9); + \fill[gray!20] (-0.8,-1.2) ellipse (0.8 and 0.45); + \fill[cyan!20] (1.1,-0.1) ellipse (1.5 and 1.0); + + % means after one update + \node[center] (c1b) at (-0.8, 1.7) {}; + \node[center] (c2b) at ( 1.5,0.3) {}; + \node[center] (c3b) at (-0.3,-1.4) {}; + + % dashed regions + \draw[magenta!60,dashed,thick] (c1b) circle (1.0); + \draw[cyan!60,dashed,thick] (c2b) circle (1.5); + \draw[gray!60,dashed,thick] (c3b) circle (0.9); + + \draw[panel] (-2.4,-3) rectangle (2.4,3); + \node at (0,2.7) {\small Iteration 1}; +\end{scope} + +% ================== Panel 3: converged ================== +\begin{scope}[shift={(12,0)}] + \clip (-2.4,-3) rectangle (2.4,3); + + % same data blobs + \fill[magenta!20] (-0.4,1.3) circle (0.9); + \fill[gray!20] (-0.8,-1.2) ellipse (0.8 and 0.45); + \fill[cyan!20] (1.1,-0.1) ellipse (1.5 and 1.0); + + % final means with labels + \node[center] + (c1c) at (-0.4, 1.3) {}; + + \node[center] + (c2c) at ( 1.1,-0.1) {}; + + \node[center] + (c3c) at (-0.8,-1.2) {}; + + % final boundaries + \draw[magenta!60,dashed,thick] (c1c) circle (1.0); + \draw[cyan!60,dashed,thick] (c2c) circle (1.5); + \draw[gray!60,dashed,thick] (c3c) circle (0.9); + + \draw[panel] (-2.4,-3) rectangle (2.4,3); + \node at (0,2.7) {\small Iteration 2}; +\end{scope} + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/9/4.pdf b/notes/2025/tikz/9/4.pdf new file mode 100644 index 0000000..adc8256 Binary files /dev/null and b/notes/2025/tikz/9/4.pdf differ diff --git a/notes/2025/tikz/9/4.tex b/notes/2025/tikz/9/4.tex new file mode 100644 index 0000000..2c44612 --- /dev/null +++ b/notes/2025/tikz/9/4.tex @@ -0,0 +1,34 @@ +\documentclass[tikz,border=5pt]{standalone} +\usepackage{amsmath} +\usetikzlibrary{positioning,fit,arrows.meta} + +\begin{document} +\begin{tikzpicture}[ + latent/.style={circle,draw,thick,minimum size=14pt,inner sep=0pt}, + observed/.style={latent,fill=gray!20}, + plate/.style={draw,thick,rounded corners,inner sep=8pt}, + >=Latex +] + +% nodes +\node[latent] (Z) {$Z$}; +\node[observed] (X) [below=1.6cm of Z] {$X$}; + +% arrow Z -> X +\draw[->,thick] (Z) -- (X); + +% labels on the right +\node[right=0.5cm of Z] {\small latent variable}; +\node[right=0.5cm of X] {\small observed data}; + +% prior / parameters +\draw[<-] (Z) --++ (-1,0); +\draw[->] (X) --++ (0,-1); +\node[left=0.9cm of Z] {$\pi$}; +\node[below=0.8cm of X] {\small $\mu_k,\;\Sigma_k$}; + +% plate surrounding the generative block +\node[plate,fit=(Z)(X)] (plate1) {}; + +\end{tikzpicture} +\end{document} \ No newline at end of file