diff --git a/.gitignore b/.gitignore index 947bdc4..3a97470 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,7 @@ notes/2025/tikz/4 notes/2025/mvp/chapters/4-nn.tex notes/2025/mvp/chapters/4-nn.pdf -0-ac* \ No newline at end of file +0-ac* + +logs/ +movs/ \ No newline at end of file diff --git a/notes/2025/mvp/chapters/2-lr.tex b/notes/2025/mvp/chapters/2-lr.tex index 6c82f6a..fc89658 100644 --- a/notes/2025/mvp/chapters/2-lr.tex +++ b/notes/2025/mvp/chapters/2-lr.tex @@ -8,9 +8,7 @@ \chapter{Logistic Regression} \item Sigmoid Regression \item Maximum a posteriori \end{introduction} -\section{Classification} - - \subsection{Binary Classification Problem} +\section{Binary Classification} Settings. \begin{itemize} @@ -58,7 +56,7 @@ \section{Classification} \centering \includegraphics{../../tikz/2/2.pdf} \caption{Classification by heperplane.} - \label{2-lr} + \label{fig:2-hyperplane} \end{figure} @@ -182,7 +180,7 @@ \section{Classification} If all points can be separated by a linear model without error, we say the dataset is {linearly separable}. \end{definition} -Example \ref{2-lr} is linearly separable, and the final state leads to $\|W\| \to \infty, \quad \|b\| \to \infty$. However, this situation is not desirable in practice, since it implies poor robustness. Hence a natural question arises: under the condition of linear separability, how can we find a well-chosen separating hyperplane that maximizes robustness? The answer will be presented in the next chapter, where we introduce the Support Vector Machine (SVM). The SVM optimizes $\hat w, \hat b$ by maximizing the margin (the distance between data points and the separating hyperplane), instead of simply minimizing the cross-entropy loss. +Figure~\ref{fig:2-hyperplane} illustrates a linearly separable setting, and the final state leads to $\|W\| \to \infty, \quad \|b\| \to \infty$. However, this situation is not desirable in practice, since it implies poor robustness. Hence a natural question arises: under the condition of linear separability, how can we find a well-chosen separating hyperplane that maximizes robustness? The answer will be presented in the next chapter, where we introduce the Support Vector Machine (SVM). The SVM optimizes $\hat w, \hat b$ by maximizing the margin (the distance between data points and the separating hyperplane), instead of simply minimizing the cross-entropy loss. Although logistic regression may suffer from divergence of parameters under separable data, it often achieves better performance than SVM in practice, due to the following reasons: \begin{enumerate} @@ -261,7 +259,7 @@ \section{Rethink of Linear Regression} \centering \includegraphics{../../tikz/2/3.pdf} \caption{Normal distribution (95\%).} - \label{2-lr} + \label{fig:2-normal} \end{figure} While the Central Limit Theorem (CLT) does not imply that most datasets are normally distributed, it motivates modeling additive noise as Gaussian. 
We assume: diff --git a/notes/2025/mvp/chapters/3-svm.tex b/notes/2025/mvp/chapters/3-svm.tex index d1e9a01..8f5c036 100644 --- a/notes/2025/mvp/chapters/3-svm.tex +++ b/notes/2025/mvp/chapters/3-svm.tex @@ -237,7 +237,8 @@ \subsection{Hard Margin} \begin{figure}[H] \centering \includegraphics{../../tikz/3/1.pdf} - \label{2-lr} + \caption{Distance from a point to a hyperplane.} + \label{fig:3-hyperplane-distance} \end{figure} \begin{proof} \begin{enumerate} @@ -962,7 +963,7 @@ \section{Kernel} \begin{equation} K(x, z) = x^\top z; \end{equation} - \item \textbf{Polynomial kernel}: maps $\mathbb{R}^n \to \mathbb{R}^{\scriptsize\begin{pmatrix}n+p\\p\end{pmatrix}}$, + \item \textbf{Polynomial kernel}: can be understood as mapping $\mathbb{R}^n$ into a feature space of dimension $\binom{n+p}{p}$, \begin{equation} K(x, z) = (x^\top z + 1)^p; \end{equation} diff --git a/notes/2025/mvp/chapters/7-gp.pdf b/notes/2025/mvp/chapters/7-gp.pdf index 6edc425..6c164c3 100644 Binary files a/notes/2025/mvp/chapters/7-gp.pdf and b/notes/2025/mvp/chapters/7-gp.pdf differ diff --git a/notes/2025/mvp/chapters/7-gp.tex b/notes/2025/mvp/chapters/7-gp.tex index 0fa7a16..a412987 100644 --- a/notes/2025/mvp/chapters/7-gp.tex +++ b/notes/2025/mvp/chapters/7-gp.tex @@ -527,7 +527,7 @@ \section{Gaussian Process Regression (GPR)} \Sigma_\star \bigr), \end{equation} -with mean +with posterior mean \begin{equation} \mu_\star = k(x_\star,X)^\top (K + \sigma^2 I)^{-1} y, @@ -564,7 +564,7 @@ \section{Gaussian Process Regression (GPR)} Gram matrix $K_\ell$ converges to a diagonal matrix: \begin{equation} \lim_{\ell \to 0} K_\ell - = \sigma_f^2 I_n. + = \sigma_f^2 I. \end{equation} In the noise-free case (i.e.\ $\sigma^2 = 0$ in the observation @@ -581,7 +581,7 @@ \section{Gaussian Process Regression (GPR)} \qquad K_\ell^{-1} \;\longrightarrow\; - \frac{1}{\sigma_f^2} I_n, + \frac{1}{\sigma_f^2} I, \end{equation} where $e_j$ is the $j$-th standard basis vector in $\mathbb R^n$. Hence @@ -595,5 +595,82 @@ \section{Gaussian Process Regression (GPR)} exactly at every training input: each training point is matched perfectly by the predictive mean. \end{remark} +%=================================== +From the discussion above, we know that the \emph{posterior mean} of a Gaussian +Process provides the predictive value for a test input, while the associated +\emph{uncertainty} can be quantified through the \emph{predictive variance}. +The relative standard deviation is simply the square root of this predictive +variance. + +Gaussian Process Regression (GPR) serves as the mathematical foundation of +\textbf{Bayesian Optimization (BO)}, a framework for performing optimization +when the objective function is expensive, noisy, or lacks analytic structure. + +\begin{definition}[Black-box Optimization] +An optimization problem is called \emph{black-box optimization} if and only if +the analytical form of the objective function is unknown and no gradient +information is available. +\end{definition} +Since the gradient of the objective function is unavailable, gradient-based +methods such as gradient descent cannot be applied in black-box settings. +The only feasible operation is \emph{point-wise evaluation}: we may query the +black box at a finite number of input locations and observe the corresponding +outputs. + +Bayesian Optimization (BO) aims to construct a probabilistic surrogate model +of the black-box function and use it to locate the maximizer of $y$ with as +few evaluations as possible. 
This is particularly important when each +evaluation is expensive, for instance in hyperparameter tuning. + +The typical BO procedure proceeds as follows: +\begin{enumerate} + \item Randomly or uniformly select a small set of initial points + $x_1,\ldots,x_n$ and obtain their evaluations $y_1,\ldots,y_n$. + \item Fit a Gaussian Process Regression (GPR) model using the collected + data. + \item Use an acquisition function $a(x)$ to select one or a batch of new + query points; evaluate them, augment the dataset, and refit the GPR model. + \item Repeat the process until the evaluation budget is exhausted, and + return the point achieving the maximum observed value of $y$. +\end{enumerate} +Here are two commonly used acquisition functions in Bayesian Optimization: + +\begin{enumerate} + \item \textbf{Expected Improvement (EI)}: + \begin{equation} + a(x) = \mathbb{E}\big[(y(x) - y_{\max})^{+}\big], + \end{equation} + where + \begin{equation} + [z]^{+} := \max(0, z). + \end{equation} + Under the GPR model, the predictive distribution is + \begin{equation} + y(x) \sim \mathcal{N}\big(\mu(x),\, \Sigma(x)\big), + \end{equation} + so EI admits a closed-form expression obtained by integrating over the + tail above $y_{\max}$: + \begin{equation} + a(x)=\int_{y_{\max}}^{+\infty} \big(y(x)-y_{\max}\big)\, + \mathcal{N}\!\left(y(x)\,\middle|\,\mu(x),\,\Sigma(x)\right)\, + \mathrm{d}y(x). + \end{equation} + + \item \textbf{Upper Confidence Bound (UCB)}: + \begin{equation} + a(x)=\mu(x) + \kappa\,\sigma(x), + \end{equation} + where $\sigma(x)$ denotes the predictive standard deviation and + $\kappa>0$ controls the exploration–exploitation balance. + In essence, UCB prefers points where the model is either promising + (large $\mu$) or highly uncertain (large $\sigma$). +\end{enumerate} + +\begin{remark} + \textbf{Optuna} is a widely used automatic hyperparameter tuning library, + recommended in class for practical Bayesian Optimization. +\end{remark} + + \end{document} \ No newline at end of file diff --git a/notes/2025/mvp/chapters/8-tel.pdf b/notes/2025/mvp/chapters/8-tel.pdf new file mode 100644 index 0000000..4218af8 Binary files /dev/null and b/notes/2025/mvp/chapters/8-tel.pdf differ diff --git a/notes/2025/mvp/chapters/8-tel.tex b/notes/2025/mvp/chapters/8-tel.tex new file mode 100644 index 0000000..89e5ac4 --- /dev/null +++ b/notes/2025/mvp/chapters/8-tel.tex @@ -0,0 +1,1094 @@ +\documentclass[../main]{subfiles} +\begin{document} +\chapter{Trees and Ensemble Learning} +\begin{introduction} + \item Decision Trees and Axis-Aligned Splits + \item Information Gain and Mutual-Information + \item Feature Selection and Purity Measures + \item Bagging and Variance Reduction + \item Random Forests and Out-of-Bag Evaluation + \item Boosting and Functional Gradient View +\end{introduction} + +\section*{Review: Gaussian Processes and Bayesian Optimization} + +In the previous chapter, we introduced Gaussian Processes (GPs) and showed how +they can be used for regression. 
Given training inputs +\begin{equation} + X = (x_1,\dots,x_n), \qquad + y = (y_1,\dots,y_n)^\top, +\end{equation} +and a test point $x_\star$, a GP prior with kernel $k(\cdot,\cdot)$ and i.i.d.\ +Gaussian observation noise of variance $\lambda$ yields the joint Gaussian +distribution +\begin{equation} + \begin{pmatrix} + y\\[2pt] + f_\star + \end{pmatrix} + \sim + \mathcal{N}\!\left( + 0,\, + \begin{bmatrix} + K(X,X) + \lambda I_n + & k(X,x_\star)\\[2pt] + k(x_\star,X) & k(x_\star,x_\star) + \end{bmatrix} + \right), +\end{equation} +where $K(X,X)$ is the $n\times n$ Gram matrix and +$k(X,x_\star) = (k(x_1,x_\star),\dots,k(x_n,x_\star))^\top$. +Conditioning on the observed $y$ gives the posterior predictive distribution +for $f_\star = f(x_\star)$: +\begin{align} + \mu_\star + &= \mathbb{E}[f_\star \mid X,y,x_\star] + = k(x_\star,X)^\top + \bigl(K(X,X)+\lambda I_n\bigr)^{-1} y, + \label{eq:gp-posterior-mean}\\ + \sigma_\star^2 + &= \operatorname{Var}[f_\star \mid X,y,x_\star] + = k(x_\star,x_\star) + - k(x_\star,X)^\top + \bigl(K(X,X)+\lambda I_n\bigr)^{-1} + k(x_\star,X). + \label{eq:gp-posterior-var} +\end{align} +By evaluating $\mu_\star$ at many test points and optionally plotting +$\mu_\star \pm \sigma_\star$, we obtain a smooth predictive curve together with +credible intervals that quantify uncertainty. + +\subsection*{Black-Box Optimization with Gaussian Processes} + +An important application of GP regression is \emph{Bayesian Optimization} (BO), +which addresses \emph{black-box optimization} problems: +\begin{equation} + \max_{x\in\mathcal{X}} f(x), +\end{equation} +where: +\begin{itemize} + \item $f(x)$ has no known analytic form (no closed-form expression); + \item gradients $\nabla f(x)$ are unavailable; + \item each evaluation of $f(x)$ is \emph{expensive}. +\end{itemize} +We can, however, query the black box at chosen points: +\begin{equation} + y = f(x) + \varepsilon, +\end{equation} +and use these point evaluations to build a surrogate model. + +A canonical example is \emph{hyperparameter tuning} for neural networks or +other machine learning models. Here: +\begin{itemize} + \item $x$ encodes a vector of hyperparameters (learning rate, width, depth, + regularization strength, \dots); + \item $f(x)$ is the validation performance (e.g.\ accuracy) obtained by + training the model with hyperparameters $x$ and evaluating on a validation + set. +\end{itemize} +Each function evaluation requires a full train--validate cycle and is therefore +very costly, so we must find good $x$ using as \emph{few} evaluations as +possible. + +\begin{remark} + Bayesian Optimization with a GP surrogate proceeds iteratively: + \begin{enumerate} + \item \textbf{Initialization.} Select an initial design + $\{x_i\}_{i=1}^n$ (e.g.\ random or space-filling) and evaluate + $y_i = f(x_i)$. + \item \textbf{Fit GP surrogate.} Use $\{(x_i,y_i)\}_{i=1}^n$ to fit a GP + regression model, yielding posterior mean $\mu(x)$ and variance + $\sigma^2(x)$ for all $x\in\mathcal{X}$. + \item \textbf{Acquisition maximization.} Define an \emph{acquisition + function} $a(x)$ that uses $\mu(x)$ and $\sigma(x)$ to score candidate + points, and choose the next evaluation point(s) by + \begin{equation} + x_{\text{next}} \in \argmax_{x\in\mathcal{X}} a(x). + \end{equation} + \item \textbf{Evaluate and update.} Query the black box at + $x_{\text{next}}$ to obtain $y_{\text{next}} = f(x_{\text{next}})$, augment + the dataset, and refit (or update) the GP surrogate. 
+ \item \textbf{Repeat} Steps 2--4 until a pre-specified evaluation budget is + exhausted, then return the best observed $y$ (or its maximizer). + \end{enumerate} + The acquisition function is responsible for balancing + \emph{exploration} (sampling uncertain regions where $\sigma(x)$ is large) and + \emph{exploitation} (sampling near currently promising regions where + $\mu(x)$ is large). +\end{remark} + +\subsection*{Acquisition Functions} + +Let $y_{\max}$ denote the best function value observed so far, and suppose the +GP posterior at $x$ is Gaussian: +\begin{equation} + f(x)\mid \mathcal{D} + \sim \mathcal{N}\bigl(\mu(x), \sigma^2(x)\bigr). +\end{equation} +Two widely used acquisition functions are: +\begin{itemize} + \item \textbf{Expected Improvement (EI).} + Define the \emph{improvement} at $x$ as + \begin{equation} + I(x) = \bigl(f(x) - y_{\max}\bigr)_+ + = \max\{f(x)-y_{\max},\,0\}. + \end{equation} + The Expected Improvement is + \begin{equation} + a_{\mathrm{EI}}(x) + = \mathbb{E}[I(x)\mid\mathcal{D}]. + \end{equation} + Under the Gaussian posterior, this has a closed form: + \begin{equation} + a_{\mathrm{EI}}(x) + = (\mu(x)-y_{\max})\Phi(z) + \sigma(x)\phi(z), + \qquad + z = \frac{\mu(x)-y_{\max}}{\sigma(x)}, + \end{equation} + where $\Phi$ and $\phi$ are the CDF and PDF of the standard normal + distribution, respectively. EI focuses on regions that, on average, are + likely to yield improvements over the current best value. + + \item \textbf{Upper Confidence Bound (UCB).} + For a parameter $\kappa>0$, define + \begin{equation} + a_{\mathrm{UCB}}(x) + = \mu(x) + \kappa\,\sigma(x). + \end{equation} + When $\kappa=1$, $a_{\mathrm{UCB}}(x)$ follows the upper envelope + $\mu(x)+\sigma(x)$ of the GP posterior. Larger $\kappa$ encourages more + exploration (sampling high-uncertainty regions), while smaller $\kappa$ + favors exploitation. +\end{itemize} +In practice, BO alternates between refining the surrogate in regions where data +are scarce and zooming in on promising areas, progressively improving our +estimate of both the location and value of the optimum. + +\begin{remark} + Many modern hyperparameter optimization libraries implement Bayesian + Optimization with GP or related surrogates. For example, \texttt{Optuna} + provides a flexible interface where the user specifies: + \begin{itemize} + \item the search space of hyperparameters; + \item an evaluation budget (maximum number of trials); + \item an objective function that trains and evaluates the model. + \end{itemize} + The library then automatically manages the BO loop, returning (approximately) + optimal hyperparameters within the given budget. +\end{remark} + +\section{Decision Trees and Feature Selection} + +\subsection{Decision Trees and Axis-Aligned Splits} + +Previously, we used linear models (or linear models in a high-dimensional +feature space) to perform binary classification. The classifier can be +written as +\begin{equation} +f(x) = +\begin{cases} ++1, & \text{if } w^\top x + b \ge 0,\\[4pt] +-1, & \text{if } w^\top x + b < 0. +\end{cases} +\end{equation} +The decision boundary is the hyperplane +\begin{equation} +w^\top x + b = 0. +\end{equation} + +Another way to obtain more flexible, nonlinear decision boundaries is to use +\emph{tree-based models}. 
A simple decision tree implementing the rule above +can be drawn as follows: +\begin{figure}[h] + \centering + \includegraphics{../../tikz/8/1.pdf} +\end{figure} +\begin{definition}[Decision Tree] + A \emph{decision tree} is a tree consisting of a root node, internal nodes, + leaf nodes, and directed edges. + Each internal node partitions the data according to some feature. + For any input example, its final prediction is given by the label of the leaf + node it reaches. + \end{definition} + + \begin{example} + \newcommand{\cmark}{\ding{51}} % ✓ +\newcommand{\xmark}{\ding{55}} % ✗ + +\begin{wraptable}[20]{r}{0.4\linewidth} + \centering + % \caption{Feature table for the decision tree example} + \vspace{2mm} + \begin{tabular}{c|ccc|c} + \textbf{ID} & \textbf{A} & \textbf{B} & \textbf{C} & \textbf{y} \\ \hline + 1 & \cmark & \cmark & \cmark & \cmark \\ + 2 & \cmark & \xmark & \xmark & \cmark \\ + 3 & \cmark & \xmark & \cmark & \xmark \\ + 4 & \xmark & \cmark & \cmark & \xmark \\ + 5 & \xmark & \xmark & \xmark & \xmark \\ + 6 & \xmark & \xmark & \xmark & \xmark \\ + 7 & \xmark & \cmark & \xmark & \xmark \\ + 8 & \cmark & \xmark & \cmark & \xmark \\ + 9 & \xmark & \cmark & \cmark & \xmark \\ + \end{tabular} +\end{wraptable} + + We consider a binary classification setting where the label set is + \begin{equation} + Y \in \{-1, +1\}. + \end{equation} + Here, the interpretation is: + \begin{equation} + +1: \text{ a good researcher}, \qquad + -1: \text{ a bad researcher}. + \end{equation} + + Assume we have three binary features: + + \begin{itemize} + \item Feature \(A\): + \begin{equation} + +1: \text{ hardworking}, \qquad -1: \text{ not hardworking}. + \end{equation} + + \item Feature \(B\): + \begin{equation} + +1: \text{ has good vision}, \qquad -1: \text{ not}. + \end{equation} + + \item Feature \(C\): + \begin{equation} + +1: \text{ likes bananas}, \qquad -1: \text{ not}. + \end{equation} + \end{itemize} + + A decision tree may use these features at its internal nodes to determine + whether an example will ultimately be predicted as \(+1\) or \(-1\). + \end{example} + \begin{solution} + + \vspace{1em} + \begin{center} + \begin{minipage}{0.55\linewidth} + \centering + \includegraphics[width=\linewidth]{../../tikz/8/2.pdf} + \end{minipage} + \hfill + \begin{minipage}{0.33\linewidth} + \centering + \includegraphics[width=\linewidth]{../../tikz/8/3.pdf} + \end{minipage} + \end{center} + + \vspace{1em} + \end{solution} + +If we choose feature $C$ as the root node, the samples are not separated as cleanly as when we use feature $A$. +This leads to a natural question: \emph{given many candidate features, how can we choose the one that produces the ``purest'' split at the root?} + +From the example above, a good feature should split the training set into subsets whose labels are as pure (i.e., as close to all positive or all negative) as possible. +To quantify this notion of purity---or, conversely, of randomness---we introduce the notion of entropy. + +\subsection{Entropy, Information Gain, and Mutual Information} + +\begin{definition}[Entropy] + For a discrete random variable $X$ with distribution $P$, the (Shannon) + entropy of $P$ is defined as + \begin{equation} + H(P) + := -\sum_{x} P(x)\log P(x). + \end{equation} + Entropy measures the intrinsic uncertainty of the distribution. 
Equivalently,
+  \begin{equation}
+    H(P)
+    = \mathbb{E}_{X\sim P}\bigl[-\log P(X)\bigr],
+  \end{equation}
+  so $-\log P(X)$ can be viewed as the \emph{information content} of the outcome
+  $X$, and $H(P)$ is its average over many draws.
+\end{definition}
+Entropy has several essential properties.
+\begin{enumerate}
+\item
+\textbf{Non-negativity.} Since $\log P(x) \le 0$ for all $x$, we have
+\begin{equation}
+  H(P) = -\sum_x P(x)\log P(x) \ge 0,
+\end{equation}
+with equality if and only if the distribution is degenerate (i.e., $P(x)=1$
+for some $x$).
+\item \textbf{Upper bound.}
+Applying Jensen's inequality to the concave function $\log(\cdot)$:
+\begin{equation}
+  H(P)
+  = \mathbb{E}_{P}\!\left[\log \frac{1}{P(X)}\right]
+  \le \log \mathbb{E}_{P}\!\left[\frac{1}{P(X)}\right]
+  = \log \sum_{x:\,P(x)>0} 1.
+\end{equation}
+In particular, for a distribution supported on $n$ distinct outcomes,
+\begin{equation}
+  H(P) \le \log n,
+\end{equation}
+and equality holds if and only if $P$ is the uniform distribution:
+\begin{equation}
+  P(x) = \frac{1}{n}.
+\end{equation}
+\end{enumerate}
+\begin{remark}
+  Beyond measuring uncertainty, entropy also characterizes the fundamental
+  limit of lossless data compression.
+
+  For a discrete source with distribution $P$, any prefix-free coding scheme
+  assigns a codeword of length $\ell(x)$ to each symbol $x$. Shannon's source
+  coding theorem states that the expected code length satisfies
+  \begin{equation}
+    \mathbb{E}[\ell(X)]
+    \;\ge\;
+    H(P)
+    := -\sum_x P(x)\log P(x),
+  \end{equation}
+  with equality achievable asymptotically.
+
+  Thus, the entropy $H(P)$ represents the minimal achievable average number of
+  bits required to encode samples drawn from $P$. In this sense, entropy
+  quantifies both the intrinsic uncertainty of a distribution and the optimal
+  compression rate for a lossless encoding.
+\end{remark}
+\begin{definition}[Cross-Entropy]
+Let $P$ and $Q$ be two distributions over the same support.
+The cross-entropy between $P$ and $Q$ is defined as
+\begin{equation}
+  H(P,Q)
+  := -\sum_x P(x)\log Q(x)
+  = \mathbb{E}_{P}[-\log Q(X)].
+\end{equation}
+\end{definition}
+
+Observe:
+\begin{equation}
+  H(P,Q) - H(P)
+  = -\sum_x P(x)\log Q(x)
+   +\sum_x P(x)\log P(x)
+  = \sum_x P(x)\log\frac{P(x)}{Q(x)}.
+\end{equation}
+\begin{definition}[Kullback--Leibler (KL) Divergence]
+  Let $P$ and $Q$ be two probability distributions over the same support.
+  The Kullback--Leibler divergence from $P$ to $Q$ is defined as
+  \begin{equation}
+    D_{\mathrm{KL}}(P \,\|\, Q)
+    := \sum_{x} P(x)\,\log\frac{P(x)}{Q(x)}
+  \end{equation}
+  in the discrete case, or
+  \begin{equation}
+    D_{\mathrm{KL}}(P \,\|\, Q)
+    := \int p(x)\,\log\frac{p(x)}{q(x)}\,\mathrm{d}x
+  \end{equation}
+  in the continuous case.
+
+  KL divergence measures how different the distribution $P$ is from $Q$.
+  It satisfies
+  \begin{equation}
+    D_{\mathrm{KL}}(P\|Q) \ge 0,
+  \end{equation}
+  with equality if and only if $P = Q$ almost everywhere.
+\end{definition}
+Thus,
+\begin{equation}
+  H(P,Q) - H(P)
+  = D_{\mathrm{KL}}(P\|Q) \ge 0,
+\end{equation}
+with equality if and only if $P = Q$.
+
+Cross-entropy decomposes into:
+\begin{equation}
+  H(P,Q) = H(P) + D_{\mathrm{KL}}(P\|Q),
+\end{equation}
+meaning that cross-entropy is always at least as large as entropy and grows
+with the divergence between $P$ and $Q$.
+\begin{remark}
+  Huffman coding is a greedy algorithm for constructing an optimal
+  prefix-free code when the symbol probabilities are known.
+  More frequent symbols receive shorter codewords and rare symbols receive
+  longer ones.
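+  For instance, assuming base-2 logarithms, a source with three symbols of
+  probabilities $\tfrac12,\tfrac14,\tfrac14$ receives the prefix-free codewords
+  $0$, $10$, $11$, so the expected code length is
+  \begin{equation}
+    \mathbb{E}[\ell(X)]
+    = \tfrac12\cdot 1 + \tfrac14\cdot 2 + \tfrac14\cdot 2
+    = 1.5 = H(P)
+  \end{equation}
+  bits per symbol; the entropy bound is met exactly here because every
+  probability is a power of $\tfrac12$.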
Shannon's source coding theorem guarantees that the expected + code length of a Huffman code is no larger than $H(P)+1$ bits per symbol, + so Huffman coding is essentially optimal among all lossless prefix codes. + \begin{center} + \vspace{0.3em} + \includegraphics{../../tikz/8/4.pdf} + \end{center} + Huffman coding is tightly connected to entropy and cross-entropy. + + \textbf{1. Entropy as the optimal code length.} + For a source with distribution $P$, the entropy + \begin{equation} + H(P) = -\sum_x P(x)\log P(x) + \end{equation} + is the theoretical lower bound on the average number of bits needed to encode + symbols drawn from $P$. Huffman coding produces the optimal prefix-free code, + and its expected code length $L_{\mathrm{Huff}}$ satisfies + \begin{equation} + H(P) \le L_{\mathrm{Huff}} < H(P)+1. + \end{equation} + + \textbf{2. Cross-entropy as the cost of using a wrong code.} + Suppose we design a Huffman code based on a \emph{wrong} distribution $Q$ + but the true data come from $P$. Then the expected code length becomes + \begin{equation} + \mathbb{E}_{P}[-\log Q(X)] + = H(P,Q), + \end{equation} + which is the cross-entropy between $P$ and $Q$. + + \textbf{3. Extra penalty equals KL divergence.} + The additional number of bits required due to the mismatch between $P$ and $Q$ + is + \begin{equation} + H(P,Q) - H(P) + = D_{\mathrm{KL}}(P\|Q). + \end{equation} + Thus, the KL divergence quantifies the inefficiency of using a code optimized + for $Q$ when the true distribution is $P$. + + \textbf{Summary.} + Entropy gives the minimal achievable rate, Huffman coding attains it (up to one + bit), and cross-entropy / KL divergence measure how much extra cost arises when + the coding distribution does not match the true distribution. + \end{remark} + \begin{theorem}[Shannon's Source Coding Theorem*] + Let $X$ be a discrete memoryless source with distribution $P$. + For any prefix-free code with codeword lengths $\{\ell(x)\}$, the + expected code length satisfies + \begin{equation} + \mathbb{E}[\ell(X)] \;\ge\; H(P). + \end{equation} + Moreover, for every $\varepsilon>0$, there exists a prefix-free code such that + \begin{equation} + \mathbb{E}[\ell(X)] \;\le\; H(P)+\varepsilon, + \end{equation} + for sufficiently long block coding. + Thus $H(P)$ is the optimal achievable rate for lossless compression. + \end{theorem} +\begin{proof} + The theorem consists of two parts: a converse (no code beats entropy) and + an achievability (entropy can be approached arbitrarily closely). + + \textbf{1. Converse: No prefix-free code can beat $H(P)$.} + For any prefix-free code, Kraft's inequality gives + \begin{equation} + \sum_x 2^{-\ell(x)} \le 1. + \end{equation} + Multiplying both sides by $P(x)$ and applying Jensen's inequality to the + convex function $-\log(\cdot)$ yields + \begin{equation} + \mathbb{E}[\ell(X)] + \;=\; \sum_x P(x)\ell(x) + \;\ge\; -\sum_x P(x)\log P(x) + \;=\; H(P). + \end{equation} + Thus entropy is a lower bound on any valid coding scheme. + + \textbf{2. Achievability: Constructing codes approaching $H(P)$.} + Consider $n$ i.i.d. samples $X^n$ from the source. + The Asymptotic Equipartition Property (AEP) states that for any + $\varepsilon>0$, with probability approaching $1$, sequences in the + typical set $\mathcal{T}_\varepsilon^{(n)}$ satisfy + \begin{equation} + -\frac{1}{n}\log P(X^n) + \in [H(P)-\varepsilon,\; H(P)+\varepsilon]. + \end{equation} + The typical set contains approximately $2^{nH(P)}$ sequences. 
+ We can therefore assign to each typical sequence a codeword of length + \[ + nH(P)+o(n), + \] + while atypical sequences receive longer codewords but contribute + negligibly to the expected length. + This yields a block code with + \begin{equation} + \mathbb{E}[\ell(X^n)] + \le n(H(P)+\varepsilon). + \end{equation} + + \textbf{3. Conclusion.} + Combining the converse and achievability gives the optimal rate: + \[ + \lim_{n\to\infty} \frac{1}{n}\mathbb{E}[\ell(X^n)] = H(P). + \] + Thus $H(P)$ is both a fundamental lower bound and an achievable rate for + lossless source coding. + \end{proof} + + To measure how ``pure'' the labels in a dataset are, we use the notion of + \emph{information gain}. + + Let $D$ be a training set. + Labels lie in a finite set $Y\in\{1,\ldots,K\}$. + For each label $k$, define the subset + \begin{equation} + C_k := \{x\in D : y(x)=k\}. + \end{equation} + + The entropy of $D$ with respect to its empirical label distribution is + \begin{equation} + H(D) + = -\sum_{k=1}^K + \frac{|C_k|}{|D|} + \log\frac{|C_k|}{|D|} + \approx + -\sum_{y=1}^K P(y)\log P(y) + = H(Y), + \end{equation} + where $P(y)$ denotes the empirical frequency of label $y$. + + Let $A$ be a feature taking values in $\{a_1,\ldots,a_m\}$, and let + \begin{equation} + D_{a_i} := \{x\in D : A(x)=a_i\} + \end{equation} + be the subset of samples with feature value $a_i$. + + The conditional entropy of labels given the feature is + \begin{equation} + H(D\mid A) + = + \sum_{i=1}^m + \frac{|D_{a_i}|}{|D|} + H(D_{a_i}) + = + \sum_{i=1}^m + \frac{|D_{a_i}|}{|D|} + \left( + -\sum_{k=1}^K + \frac{|D_{a_i}\cap C_k|}{|D_{a_i}|} + \log \frac{|D_{a_i}\cap C_k|}{|D_{a_i}|} + \right), + \end{equation} +\begin{note} + \begin{equation} + H(D\mid A)=\mathbb E_{x|A}[H(D_x)] + \end{equation} +\end{note} + The \emph{information gain} of feature $A$ is defined as + \begin{definition}[Information gain (Mutual Information)] + \begin{equation} + g(D,A) + = + H(D) - H(D\mid A). + \end{equation} + \end{definition} + + A large value of $g(D,A)$ means that feature $A$ significantly reduces label + uncertainty, producing purer label subsets and therefore acting as a good + splitting feature in decision tree construction. + + More generally, for two random variables $X$ and $Y$, their \emph{mutual + information} is + \begin{equation} + I(X;Y) + = H(X) - H(X\mid Y) + = H(Y) - H(Y\mid X) + = H(X) + H(Y) - H(X,Y). + \end{equation} + In our setting, information gain is exactly the mutual information between the + feature and the label: + \begin{equation} + g(D,A) \approx I(Y;A). + \end{equation} + We therefore choose the best feature $A$ by + \begin{equation} + A^\star=\argmax_A g(D,A). + \end{equation} + + If two random variables satisfy $I(X;Y)=0$, then they are independent: knowing + one of them does not reduce the uncertainty about the other. + + Mutual information has several important properties: + \begin{enumerate} + \item \textbf{Symmetry.} $I(X;Y) = I(Y;X)$. + \item \textbf{Non-negativity.} $I(X;Y) \ge 0$, with equality if and only if $X$ and $Y$ are independent. + \item \textbf{Alternative forms.} $I(X;Y) = H(X) + H(Y) - H(X,Y)$. + \end{enumerate} + + The larger the mutual information, the more correlated $X$ and $Y$ are. When $X$ and $Y$ are highly correlated, knowing $X$ significantly reduces the uncertainty about $Y$, and vice versa. 
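+
+As a concrete check (using base-2 logarithms), revisit the researcher example
+above: two of the nine samples are positive, so
+\begin{equation}
+  H(D) = -\tfrac{2}{9}\log\tfrac{2}{9} - \tfrac{7}{9}\log\tfrac{7}{9}
+  \approx 0.764.
+\end{equation}
+Splitting on feature $A$ produces one subset of four samples (two of them
+positive) and one pure subset of five negatives, so
+\begin{equation}
+  H(D\mid A) = \tfrac{4}{9}\cdot 1 + \tfrac{5}{9}\cdot 0 \approx 0.444,
+  \qquad
+  g(D,A) \approx 0.320,
+\end{equation}
+whereas splitting on feature $C$ gives subsets with one positive out of five
+and one positive out of four, yielding $g(D,C) \approx 0.003$. Information
+gain therefore confirms that $A$ is the better root split.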
+ +\begin{figure}[h] + \centering + \includegraphics{../../tikz/8/5.pdf} +\end{figure} + +\subsection{Information Gain Ratio} + +Information gain has a significant drawback: it tends to favor features with many possible values. + +\begin{example} + Consider a dataset with two features: $A$ (taking values $a_1, a_2$) and $B$ (taking values $b_1, \ldots, b_{10}$). Suppose that: + \begin{itemize} + \item Feature $A$ splits the data into two subsets, each containing 5 classes uniformly. + \item Feature $B$ splits the data into 10 pure subsets, each containing only one class. + \end{itemize} + Then $g(D,A) = \log 10 - \log 5 = \log 2$, while $g(D,B) = \log 10 - 0 = \log 10$. + Information gain would prefer feature $B$, even though it may have poor generalization ability. + + More extremely, if we use the index (row number) as a feature, each example would fall into its own pure subset, yielding maximum information gain $g(D,\mathrm{index}) = H(D)$, but such a feature has no generalization ability for new test examples. +\end{example} + +To address this issue, we introduce the \emph{information gain ratio}, which penalizes features with many values. + +\begin{definition}[Information Gain Ratio] + The information gain ratio of feature $A$ is defined as + \begin{equation} + \mathrm{GR}(D,A) + = \frac{g(D,A)}{H(D,A)}, + \end{equation} + where $H(D,A)$ is the entropy of feature $A$ itself: + \begin{equation} + H(D,A) + = -\sum_{i=1}^m \frac{|D_{a_i}|}{|D|}\log\frac{|D_{a_i}|}{|D|}. + \end{equation} + This measures the intrinsic uncertainty of the feature $A$'s distribution. +\end{definition} + +The denominator $H(D,A)$ penalizes features with many values. When $A$ has many uniformly distributed values, $H(D,A)$ approaches $\log m$, which reduces the gain ratio. This helps prevent overfitting to features that achieve high purity simply by having many possible values. + +\subsection{Gini Index} + +Another common purity measure is the \emph{Gini index}, which serves as an alternative to entropy. + +\begin{definition}[Gini Index] + For a dataset $D$ with labels in $\{1,\ldots,K\}$, the Gini index is defined as + \begin{equation} + \mathrm{Gini}(D) + = 1 - \sum_{k=1}^K \left(\frac{|C_k|}{|D|}\right)^2 + = \sum_{k=1}^K \frac{|C_k|}{|D|}\left(1 - \frac{|C_k|}{|D|}\right), + \end{equation} + where $C_k = \{x \in D : y(x) = k\}$. +\end{definition} + +The Gini index measures the expected error rate if we randomly label examples according to the class distribution. It satisfies: +\begin{itemize} + \item $\mathrm{Gini}(D) = 0$ when $D$ is pure (all examples have the same label). + \item $\mathrm{Gini}(D)$ is maximized when the class distribution is uniform, taking the value $1 - 1/K$ for $K$ classes. + \item For binary classification ($K=2$), $\mathrm{Gini}(D) = 2p(1-p)$, where $p$ is the proportion of the positive class. +\end{itemize} + +The conditional Gini index given feature $A$ is: +\begin{equation} + \mathrm{Gini}(D \mid A) + = \sum_{i=1}^m \frac{|D_{a_i}|}{|D|} \mathrm{Gini}(D_{a_i}). +\end{equation} + +For feature selection, we choose the feature that minimizes the conditional Gini index: +\begin{equation} + A^\star = \argmin_A \mathrm{Gini}(D \mid A). +\end{equation} + +\begin{remark} + For binary classification, the Gini index and entropy (scaled by $1/2$) are very similar functions of the class probability $p$. Both achieve their minimum at $p \in \{0,1\}$ and maximum at $p = 1/2$. 
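+  For instance, with base-2 logarithms, at $p=\tfrac12$ both $2p(1-p)$ and
+  $\tfrac12 H(p)$ equal $\tfrac12$, while at $p=0.1$ they are $0.18$ and
+  roughly $0.23$, respectively.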
+\end{remark} + +For regression problems, where labels are continuous, we use a different criterion based on squared loss. + +\begin{definition}[L2 Loss Criterion] + For a regression problem, after splitting by feature $A$ into subsets $D_1,\ldots,D_m$, define the mean label in each subset: + \begin{equation} + \bar{y}_{D_i} = \frac{1}{|D_i|}\sum_{j \in D_i} y_j. + \end{equation} + The L2 loss criterion is: + \begin{equation} + L(D,A) + = \sum_{i=1}^m \sum_{j \in D_i} (y_j - \bar{y}_{D_i})^2. + \end{equation} + We choose the feature that minimizes this criterion: + \begin{equation} + A^\star = \argmin_A L(D,A). + \end{equation} +\end{definition} + +This criterion measures the variance within each subset after splitting. A smaller L2 loss indicates purer subsets, where examples in each subset have labels close to their subset mean. The mean $\bar{y}_{D_i}$ serves as the prediction for examples in subset $D_i$. + +\subsection{Building Decision Trees: Greedy Algorithm} + +Given multiple feature selection criteria (information gain, information gain ratio, Gini index, or L2 loss), we can build a decision tree. The naive approach of enumerating all possible trees is computationally infeasible (super-exponential complexity). Instead, we use a \emph{greedy algorithm}. + +The greedy algorithm builds the tree recursively: + + \textbf{Input:} Training set $D$, feature set $\mathcal{F}$. + + \textbf{Procedure:} + \begin{enumerate} + \item \textbf{Choose a feature} $A \in \mathcal{F}$ according to a purity metric (e.g., information gain, information gain ratio, Gini index, or L2 loss). + \item \textbf{Partition} $D$ into subsets $D_1,\ldots,D_m$ based on feature $A$'s values $a_1,\ldots,a_m$. + \item \textbf{Recursively build subtrees} for each subset $D_i$ using the remaining features $\mathcal{F} \setminus \{A\}$. + + \textbf{Why remove $A$?} Since all examples in $D_i$ share the same value for feature $A$, further splitting by $A$ would be redundant. + + \item \textbf{Termination conditions:} + \begin{itemize} + \item The feature set $\mathcal{F}$ is empty (no more features available). + \item The subset $D_i$ is pure (contains only one class for classification, or has very low variance for regression). + \item Maximum depth is reached. + \item Purity improvement is below a threshold. + \item Number of samples in a node is below a minimum threshold. + \end{itemize} + \item When termination occurs, the node becomes a \textbf{leaf node} with label equal to the majority class (for classification) or the mean value (for regression) in that subset. + \end{enumerate} + + \textbf{Prediction:} For a test example, traverse the tree according to its feature values until reaching a leaf node, then use the leaf's label as the prediction. + + +\begin{remark} + The naive approach of enumerating all possible trees would require exploring all permutations of feature selection orders, leading to super-exponential complexity. The greedy algorithm reduces this to polynomial time by making locally optimal choices at each step, though it does not guarantee global optimality. +\end{remark} + +\begin{remark} + To prevent overfitting, common regularization techniques include: + \begin{itemize} + \item Limiting the maximum depth of the tree. + \item Setting a minimum number of samples required to split a node. + \item Setting a minimum purity improvement threshold. + \item Using a validation set to monitor performance and stop early. 
+ \end{itemize} +\end{remark} + +\begin{remark} + The greedy algorithm is heuristic: it makes locally optimal choices at each step but does not guarantee a globally optimal tree. Despite this, decision trees and their ensemble variants (Random Forests, Gradient Boosting) are among the most powerful methods in practice, especially for tabular data. +\end{remark} + +\subsection{Continuous Features in Decision Trees} + +In the preceding discussion, we implicitly assumed that features are discrete, +taking values in a finite set. For such a feature $A$ with possible values +$\{a_1,\dots,a_m\}$, a split on $A$ partitions the training set into $m$ +subsets $D_{a_1},\dots,D_{a_m}$. + +In practice, many features are continuous. To handle a continuous feature $A$, +we can discretize it by introducing \emph{thresholds}. Let +\begin{equation} + \{a^{(1)},\dots,a^{(n)}\} +\end{equation} +be the distinct values of $A$ observed in the training set (for some feature +dimension). Sort them so that +\begin{equation} + a^{(1)} < a^{(2)} < \dots < a^{(n)}. +\end{equation} +Candidate thresholds can then be placed between successive values: +\begin{equation} + \tau_j = \frac{a^{(j)} + a^{(j+1)}}{2}, \qquad j=1,\dots,n-1. +\end{equation} +Each threshold $\tau_j$ defines a binary split +\begin{equation} + A \le \tau_j + \quad\text{vs.}\quad + A > \tau_j, +\end{equation} +which we can evaluate using any of our usual criteria (information gain, Gini +index, squared loss for regression, etc.). By scanning over all candidate +thresholds and choosing the best one according to the purity measure, the tree +handles continuous features while remaining axis-aligned. + +At prediction time, a test example with continuous feature value $A(x)$ is +routed through the tree by comparing $A(x)$ with the learned thresholds at each +internal node, just as if the feature had been discrete. + +\section{From Single Trees to Ensembles} + +So far, we have seen how to build and select a \emph{single} decision tree, by +choosing features that maximize information gain at each split. In practice, +however, a single tree often suffers from two issues: +\begin{itemize} + \item \textbf{High variance.} Small changes in the training data may lead to + very different trees, especially when the tree is grown deep and fits the + training set closely. + \item \textbf{Limited accuracy.} A single tree is easy to interpret, but its + predictive performance may lag behind more powerful models such as kernel + methods or neural networks. +\end{itemize} + +Ensemble learning addresses these issues by combining the predictions of many +base learners (often decision trees) to form a stronger model. + +\begin{definition}[Ensemble Learning] + Ensemble learning constructs a predictor by aggregating a collection of + base learners (also called \emph{weak learners}). Given base predictors + $f_1,\dots,f_M$, the ensemble predictor takes the form + \begin{equation} + F(x) = + \begin{cases} + \mathrm{majority\ vote}\bigl(f_1(x),\dots,f_M(x)\bigr), + & \text{(classification)},\\[4pt] + \dfrac{1}{M}\sum_{m=1}^M f_m(x), + & \text{(regression)}. + \end{cases} + \end{equation} + The key idea is that even if each $f_m$ is only moderately accurate, their + combination can be significantly more accurate and more robust. +\end{definition} + +\subsection{Bias--Variance Decomposition} +Consider a supervised learning setting with training dataset $D$, drawn from +some unknown data-generating distribution. 
Let $x$ denote a test input with +true label $y$, and let +\begin{equation} + f(x;D) +\end{equation} +denote the prediction of a learning algorithm trained on $D$ when evaluated at +$x$. Under squared loss, the prediction error at $(x,y)$ for a given training +set $D$ is +\begin{equation} + \bigl(f(x;D) - y\bigr)^2. +\end{equation} +Since $D$ is random, it is natural to measure performance by averaging over the +random draw of $D$. We therefore consider +\begin{equation} + \mathbb{E}_{D}\Bigl[\bigl(f(x;D) - y\bigr)^2\Bigr], +\end{equation} +which quantifies the expected squared error at $x$, averaged over the +randomness in the training data. + +Define the \emph{average prediction} at $x$ as +\begin{equation} + \bar{f}(x) + := \mathbb{E}_{D}\bigl[f(x;D)\bigr], +\end{equation} +that is, the expected prediction we would obtain if we could repeatedly sample +training sets $D$ from the underlying distribution and retrain the model. +Insert and subtract $\bar{f}(x)$ inside the square: +\begin{align} + \mathbb{E}_{D}\Bigl[\bigl(f(x;D) - y\bigr)^2\Bigr] + &= \mathbb{E}_{D} + \Bigl[\bigl(f(x;D) - \bar{f}(x) + \bar{f}(x) - y\bigr)^2\Bigr]\\ + &= \mathbb{E}_{D} + \Bigl[\bigl(f(x;D) - \bar{f}(x)\bigr)^2\Bigr] + + \bigl(\bar{f}(x) - y\bigr)^2\\ + &\quad + + 2\,\mathbb{E}_{D}\Bigl[\bigl(f(x;D) - \bar{f}(x)\bigr)\bigl(\bar{f}(x) - y\bigr)\Bigr]. +\end{align} +The cross term vanishes because +\begin{equation} + \mathbb{E}_{D}[f(x;D) - \bar{f}(x)] + = \mathbb{E}_{D}[f(x;D)] - \bar{f}(x) = 0, +\end{equation} +and $\bar{f}(x)-y$ does not depend on $D$. Hence +\begin{equation} + \mathbb{E}_{D}\Bigl[\bigl(f(x;D) - y\bigr)^2\Bigr] + = + \underbrace{\mathbb{E}_{D} + \Bigl[\bigl(f(x;D) - \bar{f}(x)\bigr)^2\Bigr]}_{\text{variance at }x} + + + \underbrace{\bigl(\bar{f}(x) - y\bigr)^2}_{\text{bias}^2 \text{ at }x}. +\end{equation} + +\begin{definition}[Bias and variance at a test point] + For a fixed test input $x$ with true label $y$, the + \emph{variance} and \emph{(squared) bias} of a learning algorithm are + defined as + \begin{align} + \mathrm{Var}_D\bigl[f(x;D)\bigr] + &= \mathbb{E}_{D} + \Bigl[\bigl(f(x;D)-\bar{f}(x)\bigr)^2\Bigr],\\ + \mathrm{Bias}^2(x) + &= \bigl(\bar{f}(x) - y\bigr)^2. + \end{align} +\end{definition} + +Intuitively: +\begin{itemize} + \item The variance term measures how sensitive the predictor $f(x;D)$ is to + fluctuations in the training data: for different draws of $D$, predictions + at $x$ may vary significantly around their mean $\bar{f}(x)$. + \item The bias term measures how far, on average, the prediction is from + the true label $y$. Even if we could train on infinitely many different + datasets, the average predictor $\bar{f}(x)$ might still systematically + miss the true target due to limited model capacity or misspecification. +\end{itemize} + +High variance typically arises from \emph{overfitting} particular training sets, +while high bias indicates that the model class itself is too simple to capture +the underlying relationship, even when averaged over many datasets. + +\begin{example}[Linear model on nonlinear data] + Suppose the data arise from a quadratic relationship, but we insist on + fitting a linear model. Even if we repeatedly resample training sets and + average the resulting fitted lines, the averaged predictor $\bar{f}(x)$ + remains linear and cannot match the true quadratic curve everywhere. This + manifests as a persistent bias term. 
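+  As an illustrative (hypothetical) instance: if $x$ is uniform on $[-1,1]$
+  and $y = x^2$ with no noise, the infinite-data least-squares line is the
+  constant $\bar f(x) = \mathbb{E}[x^2] = \tfrac13$, so the pointwise squared
+  bias $(x^2 - \tfrac13)^2$ remains strictly positive for almost every $x$,
+  no matter how much data we collect.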
+\end{example} + +Bias and variance suggest complementary strategies: +\begin{itemize} + \item To reduce \textbf{variance}, we can increase the training set size or + average multiple diverse models (as in bagging and random forests). + \item To reduce \textbf{bias}, we can increase model capacity, e.g.\ by + using more flexible hypothesis classes or boosting weak learners. +\end{itemize} + +Two classical ensemble strategies are: +\begin{itemize} + \item \emph{Bagging} (Bootstrap Aggregating): build many base learners + independently on randomized versions of the data, and average or vote. + \item \emph{Boosting}: build base learners sequentially, each one focusing + on correcting the mistakes of the previous ones. +\end{itemize} + +In modern machine learning practice, decision-tree ensembles based on bagging +and boosting are among the most powerful off-the-shelf methods, with famous +examples including Random Forests, Gradient Boosting Trees, and XGBoost. + +\subsection{Bagging: Bootstrap Aggregating} + +Bagging is designed to reduce the variance of an unstable base learner (such as +a deep decision tree) by averaging many independently trained copies. + +\begin{definition}[Bagging] + Let $D$ be a training dataset of size $n$, and let $\mathcal{A}$ be a base + learning algorithm that outputs a predictor $f = \mathcal{A}(D)$. + Bagging constructs $M$ bootstrap datasets $D^{(1)},\dots,D^{(M)}$, where + each $D^{(m)}$ is obtained by sampling $n$ points \emph{with replacement} + from $D$. + + On each bootstrap dataset $D^{(m)}$, we train a base learner + \begin{equation} + f_m = \mathcal{A}\bigl(D^{(m)}\bigr), + \qquad m=1,\dots,M. + \end{equation} + The bagged ensemble predictor is + \begin{equation} + F_{\mathrm{bag}}(x) + = + \begin{cases} + \mathrm{sign}\Bigl(\dfrac{1}{M}\sum_{m=1}^M f_m(x)\Bigr), + & \text{classification},\\[6pt] + \dfrac{1}{M}\sum_{m=1}^M f_m(x), + & \text{regression}. + \end{cases} + \end{equation} +\end{definition} + +Intuitively, each bootstrap sample $D^{(m)}$ can be viewed as a noisy version +of the original dataset. The base learners $f_m$ will differ from one another, +and averaging their predictions cancels out part of the randomness. + +\begin{remark}[Variance Reduction] + Consider a simplified setting where we average $M$ identically distributed + base predictors $f_1,\dots,f_M$ for regression, each with variance + $\sigma^2$ and pairwise correlation $\rho$. Then the variance of the + averaged predictor is + \begin{equation} + \mathrm{Var}\bigl(F_{\mathrm{bag}}(x)\bigr) + = \frac{1}{M^2}\sum_{m=1}^M\sum_{m'=1}^M + \mathrm{Cov}\bigl(f_m(x), f_{m'}(x)\bigr) + \approx \sigma^2\left(\rho + \frac{1-\rho}{M}\right). + \end{equation} + As $M$ grows, the second term $(1-\rho)/M$ vanishes, so the variance is + dominated by $\rho\sigma^2$. Thus, bagging is most effective when base + learners are accurate but have high variance and are not too strongly + correlated. +\end{remark} + +\begin{figure}[h] + \centering + \includegraphics{../../tikz/8/6.pdf} +\end{figure} + +\subsection{Random Forests} + +Random Forests specialize bagging to decision trees, and further inject +randomness at the \emph{feature} level to decorrelate the trees. + +\begin{definition}[Random Forest] + A Random Forest is an ensemble of decision trees trained with two sources + of randomness: + \begin{enumerate} + \item \textbf{Bootstrap sampling of data.} + For each tree $m$, sample a bootstrap dataset $D^{(m)}$ from $D$. 
+ \item \textbf{Random feature selection at each split.} + When splitting a node, instead of considering all features, randomly + select a subset $\mathcal{F}$ of features (of size $d_{\mathrm{sub}}$), + and choose the best split only among features in $\mathcal{F}$. + \end{enumerate} + The final prediction aggregates all trees by majority vote or averaging, + as in bagging. +\end{definition} + +The additional randomness in feature selection has two important effects: +\begin{itemize} + \item It reduces the correlation between different trees, which + strengthens the variance reduction effect of averaging. + \item It forces each tree to explore different feature combinations, + sometimes discovering useful patterns that a single greedy tree might miss. +\end{itemize} + +In practice, Random Forests are strong general-purpose models with relatively +few hyperparameters. Common choices include: +\begin{itemize} + \item the number of trees $M$ (often in the hundreds), + \item the maximum depth of each tree, + \item the number of features $d_{\mathrm{sub}}$ considered at each split + (e.g., $\sqrt{d}$ for classification, where $d$ is the total number of + features). +\end{itemize} + +\begin{remark}[Out-of-Bag Evaluation] + In each bootstrap sample $D^{(m)}$, roughly a fraction $1-1/e\approx 0.63$ + of the original training points are included, and the remaining points are + left out. For any training example, we can average the predictions of all + trees that did \emph{not} see this example during training; this is called + the \emph{out-of-bag} prediction. Aggregating these predictions over the + training set provides an internal estimate of the generalization error, + without using a separate validation set. +\end{remark} + +\subsection{Boosting} + +While bagging focuses on variance reduction by averaging many independent +learners, boosting builds an ensemble \emph{sequentially}. Each new learner +tries to correct the mistakes of the current ensemble, effectively turning a +collection of weak learners into a strong one. + +\begin{definition}[Boosting (High-Level View)] + Given a training set $D = \{(x_i,y_i)\}_{i=1}^n$, boosting maintains an + ensemble predictor + \begin{equation} + F_0(x) \equiv 0,\qquad + F_M(x) = \sum_{m=1}^M \alpha_m f_m(x), + \end{equation} + where each $f_m$ is a base learner (often a shallow tree) and the weights + $\alpha_m$ control their influence. + + At iteration $m$, boosting chooses $f_m$ to focus on the current residual + errors or misclassified examples of $F_{m-1}$, and then updates the ensemble + to $F_m$. +\end{definition} + +One classical example is AdaBoost for binary classification. It maintains a +distribution of weights over training samples, gives higher weights to +misclassified points, and fits a new weak learner to this reweighted dataset at +each iteration. + +\begin{remark}[Boosting vs.\ Bagging] + \begin{itemize} + \item Bagging trains base learners \emph{in parallel} on resampled + datasets and primarily reduces variance. + \item Boosting trains base learners \emph{sequentially}, focusing on + difficult samples, and can reduce both bias and variance, but is also + more prone to overfitting if not regularized (e.g., via tree depth, + learning rate, or early stopping). + \end{itemize} + Modern implementations such as Gradient Boosting Trees and XGBoost can be + viewed as performing a form of functional gradient descent in function + space, where each tree fits the negative gradient of the loss with respect + to the current ensemble prediction. 
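+  For example, under squared loss $\ell(y,F)=\tfrac12\,(y-F)^2$, the negative
+  gradient with respect to the current prediction is just the residual
+  $y_i - F_{m-1}(x_i)$, so each new tree is fit to the residuals left by the
+  ensemble so far.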
+\end{remark} + +\begin{figure}[h] + \centering + \includegraphics{../../tikz/8/7.pdf} +\end{figure} + +\begin{note} + From a geometric perspective, both Random Forests and boosting-based tree + ensembles can be seen as constructing a complex, highly nonlinear decision + boundary by patching together many simple, axis-aligned splits. Whereas a + single tree corresponds to a small set of such partitions, an ensemble can + carve out increasingly intricate decision regions, often achieving + state-of-the-art performance on tabular data. +\end{note} + +\end{document} diff --git a/notes/2025/mvp/chapters/9-em-mog.pdf b/notes/2025/mvp/chapters/9-em-mog.pdf new file mode 100644 index 0000000..9929aa6 Binary files /dev/null and b/notes/2025/mvp/chapters/9-em-mog.pdf differ diff --git a/notes/2025/mvp/chapters/9-em-mog.tex b/notes/2025/mvp/chapters/9-em-mog.tex new file mode 100644 index 0000000..c0c8782 --- /dev/null +++ b/notes/2025/mvp/chapters/9-em-mog.tex @@ -0,0 +1,437 @@ +\documentclass[../main]{subfiles} +\begin{document} +\chapter{Expectation--Maximization and Mixture of Gaussians} +\begin{introduction} + \item Mixture of Gaussians (MoG) as a generative clustering model + \item Maximum likelihood with latent variables: why it is hard + \item KL divergence and the evidence lower bound (ELBO) + \item EM algorithm: E-step and M-step, monotonic improvement and local optima + \item Closed-form EM updates for MoG and the connection to K-means +\end{introduction} + +\section{Mixture of Gaussians (MoG): a generative view of clustering} +We revisit clustering from a probabilistic and generative perspective. +Instead of assigning each datapoint to a cluster deterministically, we assume +each datapoint is generated by \emph{first} choosing a latent cluster index and +\emph{then} sampling from a Gaussian distribution attached to that cluster. + +\subsection{Latent variable and generative process} +Let $K\in\mathbb N$ be the number of clusters (components). For each datapoint +$x\in\mathbb R^d$, introduce a latent discrete variable +\begin{equation} + G \in \{1,2,\dots,K\}, +\end{equation} +where $G=k$ indicates that $x$ is generated from the $k$-th Gaussian component. +We model +\begin{equation} + p(G=k)=\pi_k,\qquad \pi_k\ge 0,\qquad \sum_{k=1}^K \pi_k = 1, +\end{equation} +and the conditional likelihood +\begin{equation} + p(x\mid G=k)=\mathcal N(x\mid \mu_k,\Sigma_k), +\end{equation} +where $\mu_k\in\mathbb R^d$ and $\Sigma_k\in\mathbb R^{d\times d}$ is symmetric +positive definite. + +\begin{note} + Earlier chapters may encode the latent cluster by a one-hot vector + $z\in\{0,1\}^K$ with $\sum_k z_k=1$. The integer notation here is + equivalent but notationally lighter: + $G=k \Longleftrightarrow z_k=1$. +\end{note} + +\subsection{Marginal likelihood and the MLE objective} +By marginalizing $G$, the density of $x$ becomes a mixture: +\begin{equation}\label{eq:mog-marginal} + p(x;\theta) + = + \sum_{k=1}^K p(G=k)\,p(x\mid G=k) + = + \sum_{k=1}^K \pi_k\,\mathcal N(x\mid \mu_k,\Sigma_k), +\end{equation} +where we collect all parameters as +\begin{equation} + \theta := \Bigl\{\pi_k,\mu_k,\Sigma_k\Bigr\}_{k=1}^K. +\end{equation} + +Given i.i.d.\ data $\{x_i\}_{i=1}^n$, maximum likelihood estimation solves +\begin{equation}\label{eq:mog-mle} + \argmax_{\theta}\; + \sum_{i=1}^n \log p(x_i;\theta) + = + \argmax_{\theta}\; + \sum_{i=1}^n + \log\!\left( + \sum_{k=1}^K \pi_k\,\mathcal N(x_i\mid \mu_k,\Sigma_k) + \right). 
+\end{equation} + +\begin{remark} + Unlike supervised learning objectives of the form $\sum_i \log p(y_i\mid + x_i;\theta)$, unsupervised density estimation maximizes $\sum_i \log + p(x_i;\theta)$ directly because no labels are observed. +\end{remark} + +\subsection{Why direct optimization is non-trivial} +The objective \eqref{eq:mog-mle} is difficult for at least two reasons. +\begin{itemize} + \item \textbf{Log-sum structure:} $\log\big(\sum_k \pi_k \mathcal N(\cdot)\big)$ + couples all components inside a logarithm, preventing simple closed-form + derivatives from decoupling. + \item \textbf{Constraints:} $\{\pi_k\}$ must lie on the simplex and each + $\Sigma_k$ must be positive definite. One can use constrained optimization, + penalty methods, or reparameterization, but a naive unconstrained gradient + method is not directly applicable. +\end{itemize} +\begin{remark} + Gradient descent can be made to work with non-trivial adaptations (e.g., + softmax parameters for $\pi$, Cholesky factors for $\Sigma$), but EM + typically provides a faster and more elegant framework for this class of + latent-variable MLE problems. +\end{remark} + +\section{From latent-variable MLE to ELBO} +We now derive the Expectation--Maximization (EM) algorithm in a general setting. + +\subsection{General latent-variable likelihood} +Let $x$ be observed and $z$ be a latent variable (discrete or continuous). +Assume a joint model $p(x,z;\theta)$. The marginal likelihood is +\begin{equation} + p(x;\theta) = \sum_z p(x,z;\theta)\qquad + (\text{or } \int p(x,z;\theta)\,\mathrm dz). +\end{equation} +In MLE, the objective is +\begin{equation} + \argmax_{\theta}\; \sum_{i=1}^n \log p(x_i;\theta). +\end{equation} +The marginalization over $z$ often makes $\log p(x;\theta)$ hard to optimize. + +\subsection{KL divergence: definition and key properties} +\begin{definition}[Kullback--Leibler divergence] + For distributions $Q$ and $P$ on the same variable $Z$, the KL divergence + is + \begin{equation} + \mathrm{KL}(Q\|P) + := \mathbb E_{Z\sim Q}\!\left[\log\frac{Q(Z)}{P(Z)}\right]. + \end{equation} +\end{definition} + +\begin{proposition}[Basic properties of KL]\label{prop:kl-basic} + For any $Q,P$, + \begin{enumerate} + \item $\mathrm{KL}(Q\|P)\ge 0$. + \item $\mathrm{KL}(Q\|P)=0$ if and only if $Q=P$ almost surely. + \item In general, $\mathrm{KL}(Q\|P)\neq \mathrm{KL}(P\|Q)$ (not symmetric). + \end{enumerate} +\end{proposition} +\begin{proof} + By Jensen's inequality applied to the convex function $-\log(\cdot)$, + \begin{align*} + \mathrm{KL}(Q\|P) + &= -\mathbb E_{Q}\!\left[\log \frac{P(Z)}{Q(Z)}\right] + \ge -\log \mathbb E_{Q}\!\left[\frac{P(Z)}{Q(Z)}\right] + = -\log\!\left(\sum_z P(z)\right) + = 0, + \end{align*} + and equality holds iff $\frac{P(Z)}{Q(Z)}$ is constant $Q$-a.s., which gives + $Q=P$ a.s. Non-symmetry follows from counterexamples. +\end{proof} + +\begin{remark} + KL can be written as \textbf{cross-entropy minus entropy}: + \begin{equation} + \mathrm{KL}(Q\|P) + = + \underbrace{\mathbb E_Q[-\log P(Z)]}_{\text{cross-entropy}} + - + \underbrace{\mathbb E_Q[-\log Q(Z)]}_{\text{entropy}}. + \end{equation} + This explains why using the ``wrong code'' $P$ to encode samples from $Q$ + necessarily incurs extra expected description length. +\end{remark} + +\subsection{ELBO decomposition} +Introduce an arbitrary distribution $q(z)$ (a ``variational distribution''). 
+Then the following identity holds: +\begin{theorem}[ELBO decomposition]\label{thm:elbo} + For any $q(z)$ and any $\theta$, + \begin{equation}\label{eq:elbo-decomp} + \log p(x;\theta) + = + \underbrace{ + \mathbb E_{z\sim q}\!\left[\log \frac{p(x,z;\theta)}{q(z)}\right] + }_{\mathcal L(q,\theta)\;\;(\text{ELBO})} + + + \underbrace{ + \mathrm{KL}\!\left(q(z)\,\middle\|\,p(z\mid x;\theta)\right) + }_{\ge 0}. + \end{equation} + Consequently, $\mathcal L(q,\theta)\le \log p(x;\theta)$ for all $q$. +\end{theorem} +\begin{proof} + Starting from $\log p(x;\theta)$ and taking expectation over $q$, + \begin{align*} + \log p(x;\theta) + &= \sum_z q(z)\,\log p(x;\theta) \\ + &= \sum_z q(z)\,\log \frac{p(x,z;\theta)}{p(z\mid x;\theta)} \\ + &= \sum_z q(z)\,\log \frac{p(x,z;\theta)}{q(z)} + + \sum_z q(z)\,\log \frac{q(z)}{p(z\mid x;\theta)}, + \end{align*} + where the last term is exactly the KL divergence. +\end{proof} + +\begin{definition}[Evidence lower bound (ELBO)] + The functional + \begin{equation} + \mathcal L(q,\theta) + := + \mathbb E_{z\sim q}\!\left[\log p(x,z;\theta)\right] + - + \mathbb E_{z\sim q}\!\left[\log q(z)\right] + = + \mathbb E_{z\sim q}\!\left[\log \frac{p(x,z;\theta)}{q(z)}\right] + \end{equation} + is called the \emph{evidence lower bound}. +\end{definition} + +\begin{remark} + The abbreviation \textbf{ELBO} is commonly read as ``elbow'' in talks. + The bound gap is precisely + $\mathrm{KL}\big(q(z)\|p(z\mid x;\theta)\big)$. + When $q(z)=p(z\mid x;\theta)$, the bound is tight. +\end{remark} + +\begin{note} + When the posterior $p(z\mid x;\theta)$ is intractable, one typically + restricts $q$ to a tractable family and maximizes ELBO approximately (this + is the core idea of variational inference). EM can be viewed as the special + case where the E-step posterior is tractable and makes the bound tight at + the current iterate. +\end{note} + +\section{The EM algorithm} +EM is an iterative algorithm that alternates between optimizing $q$ (E-step) +and optimizing $\theta$ (M-step), using ELBO as a surrogate objective. + +\subsection{E-step and M-step from ELBO} +Suppose we are at $\theta^{(t)}$. +\begin{itemize} + \item \textbf{E-step (fix $\theta$):} + choose $q^{(t+1)}$ to maximize $\mathcal L(q,\theta^{(t)})$. + Since $\log p(x;\theta^{(t)})$ is a constant w.r.t.\ $q$, + Theorem~\ref{thm:elbo} implies + \begin{equation}\label{eq:e-step} + q^{(t+1)}(z) + = + p\bigl(z\mid x;\theta^{(t)}\bigr), + \end{equation} + which minimizes the KL gap to $0$. + + \item \textbf{M-step (fix $q$):} + update $\theta$ by maximizing the ELBO + \begin{equation}\label{eq:m-step} + \theta^{(t+1)} + \in + \argmax_{\theta}\; \mathcal L\!\left(q^{(t+1)},\theta\right). + \end{equation} + Because the term $-\mathbb E_{q^{(t+1)}}[\log q^{(t+1)}(z)]$ does not depend + on $\theta$, the M-step equivalently maximizes + \begin{equation}\label{eq:q-function} + Q(\theta\mid \theta^{(t)}) + := + \mathbb E_{z\sim p(z\mid x;\theta^{(t)})}\!\left[\log p(x,z;\theta)\right], + \end{equation} + the expected complete-data log-likelihood. +\end{itemize} + +\begin{remark} + The ``expectation'' in \textbf{E-step} refers to taking expectation with + respect to the posterior $p(z\mid x;\theta^{(t)})$, which is computed in the + E-step and then used to form the expected complete log-likelihood in the + M-step. 
+\end{remark} + +\begin{note} + The overall algorithm repeats \textbf{E-step $\rightarrow$ M-step} until + convergence, e.g.\ until $\theta^{(t+1)}$ is sufficiently close to + $\theta^{(t)}$ or the log-likelihood improvement becomes negligible. +\end{note} + +\subsection{Monotonicity and local optima} +\begin{proposition}[Monotonic improvement] + Each EM iteration does not decrease the data log-likelihood: + \[ + \log p(x;\theta^{(t+1)}) \ge \log p(x;\theta^{(t)}). + \] +\end{proposition} +\begin{proof} + In the E-step, the ELBO is made tight at $\theta^{(t)}$ by choosing + $q^{(t+1)}(z)=p(z\mid x;\theta^{(t)})$, hence + $\mathcal L(q^{(t+1)},\theta^{(t)})=\log p(x;\theta^{(t)})$. + In the M-step, we maximize the ELBO in $\theta$, so + \[ + \mathcal L(q^{(t+1)},\theta^{(t+1)}) + \ge + \mathcal L(q^{(t+1)},\theta^{(t)}) + = + \log p(x;\theta^{(t)}). + \] + Since ELBO is always a lower bound, $\log p(x;\theta^{(t+1)})\ge + \mathcal L(q^{(t+1)},\theta^{(t+1)})$, proving the claim. +\end{proof} + +\begin{remark} + EM typically converges to a \emph{stationary point}, which may be a local + optimum. Different initializations can lead to different solutions. +\end{remark} + +% \begin{figure}[h] +% \centering +% \includegraphics[width=0.9\textwidth]{../../mvp/tikz/9/5.pdf} +% \caption{Geometric intuition: each E-step picks an ELBO that ``touches'' the log-likelihood at the current $\theta^{(t)}$, and the M-step maximizes that ELBO to obtain a new parameter $\theta^{(t+1)}$ with increased likelihood.} +% \label{fig:em-elbo-geometry} +% \end{figure} + +\section{EM for Mixture of Gaussians} +We now apply the general EM framework to MoG. + +\subsection{Complete-data likelihood} +For a single datapoint $(x_i,G_i)$, +\begin{equation} + p(x_i,G_i=k;\theta) + = + p(G_i=k)\,p(x_i\mid G_i=k) + = + \pi_k\,\mathcal N(x_i\mid \mu_k,\Sigma_k). +\end{equation} +For the dataset, the complete-data log-likelihood is +\begin{equation} + \log p(\{x_i\},\{G_i\};\theta) + = + \sum_{i=1}^n \sum_{k=1}^K \mathbb I\{G_i=k\} + \Bigl( + \log \pi_k + \log \mathcal N(x_i\mid \mu_k,\Sigma_k) + \Bigr). +\end{equation} + +\subsection{E-step: responsibilities} +Define the \emph{responsibility} (posterior assignment probability) +\begin{equation}\label{eq:responsibility} + \gamma_{ik} + := + p(G_i=k \mid x_i;\theta^{(t)}). +\end{equation} +By Bayes' rule, +\begin{equation}\label{eq:mog-e-step} + \gamma_{ik} + = + \frac{ + \pi_k^{(t)}\,\mathcal N(x_i\mid \mu_k^{(t)},\Sigma_k^{(t)}) + }{ + \sum_{j=1}^K \pi_j^{(t)}\,\mathcal N(x_i\mid \mu_j^{(t)},\Sigma_j^{(t)}) + }. +\end{equation} +Intuitively, $\gamma_{ik}$ measures the fraction of ``credit'' component $k$ +takes for explaining datapoint $x_i$. + +\subsection{M-step: closed-form updates} +Given $\{\gamma_{ik}\}$, the EM objective for MoG becomes +\begin{equation}\label{eq:mog-Q} + Q(\theta\mid \theta^{(t)}) + = + \sum_{i=1}^n \sum_{k=1}^K \gamma_{ik} + \Bigl( + \log \pi_k + \log \mathcal N(x_i\mid \mu_k,\Sigma_k) + \Bigr). +\end{equation} +Let +\begin{equation} + N_k := \sum_{i=1}^n \gamma_{ik}. +\end{equation} + +\subsubsection{Update of $\pi_k$ (simplex constraint)} +We maximize \eqref{eq:mog-Q} w.r.t.\ $\pi$ subject to $\sum_k \pi_k=1$. +Introduce a Lagrange multiplier $\lambda$: +\begin{equation} + \mathcal J(\pi,\lambda) + = + \sum_{i=1}^n \sum_{k=1}^K \gamma_{ik}\log \pi_k + + \lambda\left(1-\sum_{k=1}^K \pi_k\right). +\end{equation} +Setting $\partial \mathcal J/\partial \pi_k=0$ gives +\begin{equation} + \pi_k + = + \frac{1}{n}\sum_{i=1}^n \gamma_{ik} + = + \frac{N_k}{n}. 
+\end{equation} + +\subsubsection{Update of $\mu_k$} +Using the Gaussian log-density, the terms depending on $\mu_k$ yield the +weighted least-squares problem whose solution is +\begin{equation}\label{eq:mog-mu-update} + \mu_k + = + \frac{\sum_{i=1}^n \gamma_{ik} x_i}{\sum_{i=1}^n \gamma_{ik}} + = + \frac{1}{N_k}\sum_{i=1}^n \gamma_{ik} x_i. +\end{equation} + +\subsubsection{Update of $\Sigma_k$} +Similarly, maximizing w.r.t.\ $\Sigma_k$ gives +\begin{equation}\label{eq:mog-sigma-update} + \Sigma_k + = + \frac{\sum_{i=1}^n \gamma_{ik} (x_i-\mu_k)(x_i-\mu_k)^\top}{\sum_{i=1}^n \gamma_{ik}} + = + \frac{1}{N_k}\sum_{i=1}^n \gamma_{ik} (x_i-\mu_k)(x_i-\mu_k)^\top. +\end{equation} + +\begin{remark} + The updates \eqref{eq:mog-mu-update}--\eqref{eq:mog-sigma-update} are + \textbf{soft-assignment weighted} empirical mean and covariance. If + $\gamma_{ik}\in\{0,1\}$ becomes hard assignment, they reduce to the usual + sample mean/covariance of points in cluster $k$. +\end{remark} + +\subsection{Relationship to K-means as a limiting case} +Assume an isotropic shared covariance $\Sigma_k=\sigma^2 I$ for all $k$. +Then +\begin{equation} + \gamma_{ik} + \propto + \pi_k \exp\!\left(-\frac{\|x_i-\mu_k\|^2}{2\sigma^2}\right). +\end{equation} +As $\sigma\to 0$, the softmax distribution concentrates on the closest mean: +\begin{equation} + \gamma_{ik} + \to + \begin{cases} + 1, & k=\argmin_j \|x_i-\mu_j\|^2,\\ + 0, & \text{otherwise}, + \end{cases} +\end{equation} +which recovers the hard assignment step of K-means. With these hard assignments, +the M-step mean update reduces to the K-means centroid update. + +\begin{remark} + This connection explains a common interpretation: \textbf{K-means is a + special/limiting case of MoG} (and can be seen as a degenerate EM). +\end{remark} + +\section{A clarification: PCA vs regression (from Q\&A)} +\begin{note} + PCA is an \emph{unsupervised} problem: all coordinates are treated + symmetrically and the objective is to find a low-dimensional subspace that + best explains the variance/geometry of $x$. + In contrast, regression is \emph{supervised}: $y$ plays a special role and + the objective is to minimize prediction error of $y$ given $x$. + Therefore, even if one augments $x$ with $y$, the resulting optimization is + not equivalent to PCA because the symmetry between coordinates is broken by + the learning objective. +\end{note} + +\end{document} + + diff --git a/notes/2025/mvp/chapters/9-ul.pdf b/notes/2025/mvp/chapters/9-ul.pdf new file mode 100644 index 0000000..0e5fb12 Binary files /dev/null and b/notes/2025/mvp/chapters/9-ul.pdf differ diff --git a/notes/2025/mvp/chapters/9-ul.tex b/notes/2025/mvp/chapters/9-ul.tex new file mode 100644 index 0000000..555ae42 --- /dev/null +++ b/notes/2025/mvp/chapters/9-ul.tex @@ -0,0 +1,697 @@ +\documentclass[../main]{subfiles} +\begin{document} +\chapter{Unsupervised Learning} +\begin{introduction} + \item Dimensionality reduction + \item Cluster + \item Latent-variable models and the EM algorithm +\end{introduction} +All methods we have discussed so far fall under the category of \textbf{supervised learning}, where the training process relies on labeled data and the model is explicitly guided by pairs $(x,y)$. +In contrast, \textbf{unsupervised learning} operates without labels: the goal is to learn the underlying structure of the data and to approximate the distribution of datapoints $x$ itself. + +\begin{definition}[Unsupervised Learning] + Let $\mathcal{X}$ be an unlabeled dataset. 
+	Unsupervised learning aims to learn a mapping $f:\mathcal{X}\to\mathcal{Z}$ or a generative model $p_\theta(x)$ that captures the intrinsic structure of the underlying data distribution $p(x)$.
+	Typical objectives include clustering, dimensionality reduction, density estimation, and representation learning.
+	Importantly, no labeled pairs $(x,y)$ are observed during training.
+\end{definition}
+
+Unsupervised learning can be divided into several tasks:
+\begin{enumerate}
+	\item Dimensionality reduction: datapoints often lie on a high-dimensional manifold, yet only a small number of directions carry most of the relevant structure. Dimensionality reduction trains a model to recover these crucial directions from the enormous number of raw dimensions.
+	\item Clustering: train a model to group similar datapoints together.
+	\item Generative models: models such as Sora and GPT-style LLMs are trained to generate content such as text or images by learning the distribution of the target domain.
+\end{enumerate}
+\section{Dimensionality Reduction}
+\subsection{Principal Component Analysis (PCA)}
+
+\begin{theorem}[Principle of PCA]
+	PCA seeks a direction $w$ (with $\|w\|=1$) along which the projected data $w^\top x$
+	achieves the \emph{maximum possible variance}.
+	Formally, the goal is to maximize
+	\begin{equation}
+		\operatorname{Var}(w^\top X).
+	\end{equation}
+\end{theorem}
+
+	Let the dataset be
+	\begin{equation}
+		X =
+		\begin{pmatrix}
+			x_1^{\top} \\
+			x_2^{\top} \\
+			\vdots \\
+			x_n^{\top}
+		\end{pmatrix}
+		\in \mathbb{R}^{n\times d},
+	\end{equation}
+	with sample mean
+	\begin{equation}
+		\bar{x} = \frac{1}{n}\sum_{i=1}^n x_i .
+	\end{equation}
+	Each $x_i \in \mathbb{R}^d$ is an observation.
+	\begin{figure}[h]
+		\centering
+		\includegraphics{../../tikz/9/1.pdf}
+	\end{figure}
+
+	The empirical covariance matrix is defined as
+	\begin{equation}
+		\Sigma = \frac{1}{n}\sum_{i=1}^n (x_i - \bar{x})(x_i - \bar{x})^{\top}
+		\in \mathbb{R}^{d\times d}.
+	\end{equation}
+	Entrywise,
+	\begin{equation}
+		\Sigma_{jk}
+		= \frac{1}{n}\sum_{i=1}^n
+		(x_i^{(j)} - \bar{x}^{(j)})(x_i^{(k)} - \bar{x}^{(k)}).
+	\end{equation}
+	\begin{remark}
+		The covariance matrix defined with the factor $\tfrac{1}{n}$ is technically a biased estimator of the true population covariance.
+		The unbiased version uses $\tfrac{1}{n-1}$ instead.
+		However, when $n$ is large, the difference between $\tfrac{1}{n}$ and $\tfrac{1}{n-1}$ is negligible:
+		\begin{equation}
+			\frac{1}{n} = \frac{1}{n-1} \left( 1 - \frac{1}{n} \right),
+		\end{equation}
+		and thus affects the covariance only by a vanishing scalar factor.
+
+		Moreover, PCA depends solely on the \emph{eigenvectors} of the covariance matrix, i.e., the principal directions.
+		Multiplying $\Sigma$ by any positive constant does not change its eigenvectors.
+		Therefore, for PCA, the choice between $\tfrac{1}{n}$ and $\tfrac{1}{n-1}$ has no effect on the principal components.
+	\end{remark}
+	\begin{lemma}[Variance of a Linear Projection]
+		Let $w$ be any unit vector in $\mathbb{R}^d$.
+		The projection of a data point $x_i$ onto $w$ is the scalar
+		\begin{equation}
+			z_i = w^\top x_i .
+		\end{equation}
+		The sample mean of the projected data is
+		\begin{equation}
+			\bar{z} = \frac{1}{n}\sum_{i=1}^n z_i
+			= \frac{1}{n}\sum_{i=1}^n w^\top x_i
+			= w^\top \bar{x}.
+		\end{equation}
+		Thus
+		\begin{equation}
+			z_i - \bar{z}
+			= w^\top (x_i - \bar{x}).
+		\end{equation}
+		The sample variance of the projected data is
+		\begin{equation}
+			\operatorname{Var}(w^\top X)
+			= \frac{1}{n} \sum_{i=1}^n (z_i - \bar{z})^2
+			= \frac{1}{n} \sum_{i=1}^n
+			\bigl( w^\top (x_i - \bar{x}) \bigr)^2 .
+		\end{equation}
+		Using the identity
+		\[
+			(w^\top a)^2 = w^\top (a a^\top) w,
+		\]
+		we obtain
+		\begin{equation}
+			\operatorname{Var}(w^\top X)
+			= w^\top
+			\left(
+			\frac{1}{n}\sum_{i=1}^n (x_i - \bar{x})(x_i - \bar{x})^\top
+			\right)
+			w
+			= w^\top \Sigma w.
+		\end{equation}
+	\end{lemma}
+	Let $u_1 \in \mathbb{R}^d$ be a unit vector onto which each $x_i$ is projected (a one-dimensional subspace); then PCA is equivalent to solving
+	\begin{equation}
+		\argmax_{\|u_1\|=1}\; u_1^\top \Sigma u_1 .
+	\end{equation}
+	By introducing a Lagrange multiplier $\lambda$ for the constraint $u_1^\top u_1 = 1$, we consider the Lagrangian
+	\begin{equation}
+		\mathcal{L}(u_1,\lambda)
+		= u_1^\top \Sigma u_1 - \lambda \bigl( u_1^\top u_1 - 1 \bigr).
+	\end{equation}
+	Taking the derivative with respect to $u_1$ and setting it to zero gives
+	\begin{equation}
+		\frac{\partial \mathcal{L}}{\partial u_1}
+		= 2 \Sigma u_1 - 2 \lambda u_1 = 0,
+	\end{equation}
+	hence
+	\begin{equation}
+		\Sigma u_1 = \lambda u_1.
+	\end{equation}
+	The derivative with respect to $\lambda$ enforces the constraint
+	\begin{equation}
+		\frac{\partial \mathcal{L}}{\partial \lambda}
+		= -\bigl(u_1^\top u_1 - 1\bigr) = 0
+		\quad\Longrightarrow\quad
+		u_1^\top u_1 = 1.
+	\end{equation}
+	Therefore, any maximizer \textbf{$u_1$ must be an eigenvector of $\Sigma$}, with $\lambda$ equal to the corresponding eigenvalue.
+	Since
+	\begin{equation}
+		u_1^\top \Sigma u_1 = \lambda
+	\end{equation}
+	for any unit eigenvector $u_1$, the optimization problem is solved by choosing $u_1$ as the eigenvector associated with the \textbf{largest eigenvalue of $\Sigma$}.
+	This $u_1$ is the first principal component.
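+
+\begin{note}
+	As an illustrative sanity check (a toy example, not tied to any dataset above), take $d=2$ and suppose the empirical covariance is
+	\begin{equation*}
+		\Sigma =
+		\begin{pmatrix}
+			2 & 1 \\
+			1 & 2
+		\end{pmatrix}.
+	\end{equation*}
+	Its eigenvalues are $\lambda_1 = 3$ and $\lambda_2 = 1$, with unit eigenvectors $u_1 = \tfrac{1}{\sqrt{2}}(1,1)^\top$ and $u_2 = \tfrac{1}{\sqrt{2}}(1,-1)^\top$.
+	Hence the first principal component points along the diagonal direction $(1,1)$, and the projected variance along it, $u_1^\top \Sigma u_1 = 3$, is the largest achievable by any unit vector.
+\end{note}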
+
+	After obtaining the first principal component $u_1$, the second principal component $u_2$ is defined as the direction that maximizes the projected variance subject to two constraints:
+\begin{equation}
+	\max_{u_2}\; u_2^\top \Sigma u_2
+\end{equation}
+subject to
+\begin{equation}
+	u_2^\top u_2 = 1,
+	\qquad
+	u_2^\top u_1 = 0.
+\end{equation}
+The first condition enforces unit length; the second ensures orthogonality to $u_1$.
+
+Introduce Lagrange multipliers $\lambda_2$ and $\alpha$.
+Consider the Lagrangian
+\begin{equation}
+\mathcal{L}(u_2,\lambda_2,\alpha)
+	= u_2^\top \Sigma u_2
+	+ \lambda_2 (1 - u_2^\top u_2)
+	+ \alpha\, u_2^\top u_1.
+\end{equation}
+Taking the derivative with respect to $u_2$ and setting it to zero yields
+\begin{equation}
+	\frac{\partial \mathcal{L}}{\partial u_2}
+	= 2 \Sigma u_2 - 2 \lambda_2 u_2 + \alpha u_1 = 0.
+\end{equation}
+
+Rearranging the stationarity condition gives
+\begin{equation}
+	\Sigma u_2 = \lambda_2 u_2 - \frac{\alpha}{2} u_1.
+\end{equation}
+Using the orthogonality constraint $u_1^\top u_2 = 0$ and the fact that
+\begin{equation}
+	\Sigma u_1 = \lambda_1 u_1,
+\end{equation}
+we obtain
+\begin{equation}
+	u_1^\top \Sigma u_2 = \lambda_2\, u_1^\top u_2 - \frac{\alpha}{2} u_1^\top u_1
+	= -\frac{\alpha}{2}.
+\end{equation}
+On the other hand,
+\begin{equation}
+	u_1^\top \Sigma u_2 = (\Sigma u_1)^\top u_2 = \lambda_1 u_1^\top u_2 = 0.
+\end{equation}
+Thus $\alpha = 0$, and the stationarity condition reduces to the eigenvalue equation
+\begin{equation}
+	\Sigma u_2 = \lambda_2 u_2.
+\end{equation}
+Therefore $u_2$ must be an eigenvector of $\Sigma$ associated with the second largest eigenvalue.
+
+By induction:
+\begin{theorem}
+	The first $K$ principal components are exactly the eigenvectors of the covariance matrix $\Sigma$ corresponding to its $K$ largest eigenvalues.
+\end{theorem}
+Denote these eigenvectors by
+\begin{equation}
+	U_{1:K}
+	=
+	( u_1, u_2, \cdots, u_K )
+	\in \mathbb{R}^{d \times K}.
+\end{equation}
+Each $u_i$ satisfies $\Sigma u_i = \lambda_i u_i$, with $\lambda_1 \ge \lambda_2 \ge \cdots \ge \lambda_K$.
+
+	Given the matrix of the top $K$ principal directions $U_{1:K}$, the projection of a data point $x \in \mathbb{R}^d$ onto the $K$-dimensional PCA subspace is $z = U_{1:K}^{\top} x \in \mathbb{R}^{K}$.
+	For a dataset $X \in \mathbb{R}^{n \times d}$, the projected data matrix is
+	\begin{equation}
+		X U_{1:K}
+		\in \mathbb{R}^{n \times K}.
+	\end{equation}
+	This reduces the dimensionality from $d$ to $K$ while retaining the directions of maximal variance.
+
+	\begin{definition}[Centered Data Matrix]
+		Let the centered data matrix be
+		\begin{equation}
+			\hat{X}
+			=
+			\begin{pmatrix}
+				x_1^{\top} - \bar{x}^{\top} \\
+				x_2^{\top} - \bar{x}^{\top} \\
+				\vdots \\
+				x_n^{\top} - \bar{x}^{\top}
+			\end{pmatrix}
+			\in \mathbb{R}^{n \times d}.
+		\end{equation}
+		Each row of $\hat{X}$ is a mean-subtracted data point.
+	\end{definition}
+
+	The empirical covariance matrix can be written compactly as
+	\begin{equation}
+		\Sigma = \frac{1}{n}\, \hat{X}^{\top}\hat{X}.
+	\end{equation}
+
+	\begin{theorem}[Eigen-Decomposition of the Covariance]
+		Because $\Sigma$ is real symmetric, it admits an orthogonal eigendecomposition:
+		\begin{equation}
+			\Sigma
+			= U \Lambda U^{\top},
+		\end{equation}
+		where $U$ is an orthogonal matrix whose columns are eigenvectors
+		and $\Lambda$ is a diagonal matrix containing the eigenvalues of $\Sigma$.
+	\end{theorem}
+
+	In practice, numerical libraries such as \texttt{numpy.linalg.eig} or
+	\texttt{numpy.linalg.eigh} compute the eigen-decomposition of $\Sigma$ directly.
+	The eigenvectors $\{u_1,\ldots,u_d\}$ form the principal directions of the data.
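+
+	The derivation above translates directly into a few lines of NumPy. The following minimal sketch (the function name and interface are illustrative, not taken from any library) centers the data, forms the covariance with the $\tfrac{1}{n}$ factor used above, and keeps the eigenvectors associated with the $K$ largest eigenvalues:
+\begin{verbatim}
+import numpy as np
+
+def pca_top_k(X, K):
+    """Top-K principal directions of X (n x d) and the centered projections."""
+    x_bar = X.mean(axis=0)                    # sample mean
+    Xc = X - x_bar                            # centered data matrix
+    Sigma = Xc.T @ Xc / X.shape[0]            # empirical covariance (1/n factor)
+    eigvals, eigvecs = np.linalg.eigh(Sigma)  # eigenvalues in ascending order
+    idx = np.argsort(eigvals)[::-1][:K]       # indices of the K largest eigenvalues
+    U = eigvecs[:, idx]                       # d x K matrix of principal directions
+    return Xc @ U, U                          # n x K projections and the directions
+\end{verbatim}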
+\subsection{Singular Value Decomposition (SVD)}
+	Consider the singular value decomposition of the centered data matrix:
+	\begin{equation}
+		\hat{X} = U_X\, S\, V^{\top}.
+	\end{equation}
+	\newcommand{\mblock}[2]{%
+		\begingroup
+		\setlength{\fboxsep}{1pt}%
+		\colorbox{#1!20}{$\displaystyle #2$}%
+		\endgroup
+}
+
+\begin{equation}
+	\hat{X}_{\mathbb{R}^{n\times d}}
+	=
+	\mblock{blue}{ U_{X\,\mathbb{R}^{n\times r}}}\,
+	\mblock{green}{\begin{pmatrix}
+				\sigma_1&\cdots &0&0&\cdots&0\\
+				0&\sigma_2&\cdots&0&\cdots&0\\
+				\vdots&\ddots &&&&\vdots\\
+				0&\cdots&0&\sigma_r&\cdots&0
+	\end{pmatrix} _{	\mathbb{R}^{r\times r}}}\,
+	\mblock{red}{V^{\top}_	{\mathbb{R}^{r\times d}} }.
+\end{equation}
+	Then
+	\begin{equation}
+		\hat{X}^{\top}\hat{X}
+		= V S^{2} V^{\top},
+	\end{equation}
+	which implies that
+	\begin{equation}
+		\Sigma = \frac{1}{n} V S^{2} V^{\top}.
+	\end{equation}
+	Thus \textbf{the right singular vectors of $\hat{X}$ are exactly the eigenvectors of $\Sigma$},
+	and the eigenvalues of $\Sigma$ are the squared singular values of $\hat{X}$ scaled by $\tfrac{1}{n}$.
+
+
+	\begin{remark}
+		This relationship provides an alternative way to compute principal components:
+		the top $K$ principal directions are simply the first $K$ right singular vectors of $\hat{X}$.
+		Working with the SVD of $\hat{X}$ is also numerically more stable than explicitly forming $\Sigma=\tfrac{1}{n}\hat{X}^{\top}\hat{X}$, since forming the product squares the condition number; the advantage is most noticeable when $d$ is large.
+ \end{remark} + \begin{note} + In practice, computing a full SVD of a \(d\times d\) matrix costs \(O(d^3)\) time, + which is prohibitive for high-dimensional data. + Modern numerical linear algebra therefore relies on \emph{approximate} SVD methods + that dramatically reduce the computational burden while preserving the leading singular components. + + A common approach is the \emph{truncated SVD}: + instead of computing all singular values, we only approximate the top \(K\) components. + Techniques such as randomized sketching, subspace iteration, and the Nyström method + project the data onto a low-dimensional subspace in which SVD becomes cheap. + These methods achieve a cost of roughly + \begin{equation*} + O(ndK) \quad\text{or}\quad O(d^2K), + \end{equation*} + depending on the algorithm, reducing the complexity significantly when \(K \ll d\). + + This is why real-world PCA implementations (e.g., \emph{\texttt{sklearn}, \texttt{numpy.linalg.svd}}, + and randomized PCA algorithms) compute only the leading singular vectors rather than performing a full decomposition. + \end{note} +\subsection{t-distributed Stochastic Neighbor Embedding (t-SNE)*} + +t-SNE is a nonlinear dimensionality reduction method designed for visualization +of high-dimensional datasets. +It constructs two probability distributions: +\begin{itemize} + \item one on the pairwise similarities in the high-dimensional space, + \item one on the pairwise similarities in the low-dimensional embedding, +\end{itemize} +and finds an embedding that makes these two distributions as close as possible. + + +\begin{definition}[High-Dimensional Similarity] +For datapoints $x_i, x_j$ in the original space, define a conditional probability +\begin{equation} + p_{j|i} + = \frac{\exp\!\left(-\|x_i - x_j\|^2 / 2\sigma_i^2\right)} + {\sum_{k \neq i} \exp\!\left(-\|x_i - x_k\|^2 / 2\sigma_i^2\right)}. +\end{equation} +The bandwidth $\sigma_i$ is chosen such that the \emph{perplexity} +of the distribution matches a user-specified value. +The symmetric joint probability is +\begin{equation} + p_{ij} = \frac{p_{j|i} + p_{i|j}}{2n}. +\end{equation} +\end{definition} +\begin{note} + It simply applies an RBF kernel to these data points. +\end{note} +\begin{definition}[Low-Dimensional Similarity] +For embedding points $y_i, y_j \in \mathbb{R}^2$, +t-SNE uses a Student-\(t\) distribution with one degree of freedom: +\begin{equation} + q_{ij} + = + \frac{\left(1 + \|y_i - y_j\|^2\right)^{-1}} + {\sum_{k \neq \ell} + \left(1 + \|y_k - y_\ell\|^2\right)^{-1}}. +\end{equation} +The heavy-tailed distribution alleviates the “crowding problem.” +\end{definition} + +\begin{theorem}[t-SNE Objective] +t-SNE seeks an embedding that minimizes the Kullback–Leibler divergence +between the high-dimensional and low-dimensional similarity distributions: +\begin{equation} + \mathcal{L}(Y) + = \mathrm{KL}(P \parallel Q) + = \sum_{i \neq j} p_{ij} \log \frac{p_{ij}}{q_{ij}}. +\end{equation} +Gradient descent on $\mathcal{L}$ yields the final visualization. +\end{theorem} + +t-SNE preserves local neighborhood structure. +Pairs with large $p_{ij}$ are forced to satisfy large $q_{ij}$, keeping +nearby points close. +The heavy-tailed $t$-distribution allows distant points to be modeled far apart, +preventing clusters from collapsing together. + +t-SNE is excellent for visualization but not for general-purpose embedding: +its geometry is not globally meaningful, and repeated runs +may differ due to initialization and stochasticity. 
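+
+In practice one rarely implements t-SNE by hand; a minimal sketch of a typical call through \texttt{scikit-learn} (assuming it is installed; the data array below is just a random placeholder) looks as follows, with the perplexity and the initialization being the main knobs to tune:
+\begin{verbatim}
+import numpy as np
+from sklearn.manifold import TSNE
+
+X = np.random.rand(200, 50)     # placeholder data: n = 200 points in d = 50 dims
+tsne = TSNE(n_components=2, perplexity=30.0, init="pca", random_state=0)
+Y = tsne.fit_transform(X)       # (200, 2) embedding used only for visualization
+\end{verbatim}
+Because of the stochastic optimization, different \texttt{random\_state} values can give visibly different layouts, which is exactly the caveat mentioned above.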
+\begin{figure}[h] + \centering + \includegraphics{../../tikz/9/2.pdf} +\end{figure} +\section{Clustering} +\subsection{K-means Clustering Algorithm} +Clustering is a fundamental task in unsupervised learning. +Given a dataset +$ + D = \{x_1, x_2, \ldots, x_n\} \subset \mathbb{R}^d, +$ +our goal is to divide these points into \(K\) meaningful groups. +Here \(K\) is a user-chosen hyperparameter, reflecting how many clusters we expect to find. + +To formalize this idea, assume that each cluster \(k \in [K]\) is represented by a \textbf{center} +$ + m_k \in \mathbb{R}^d, +$ +and each data point chooses exactly one cluster. +We denote the assignment using an indicator variable \(r_{ik} \in \{0,1\}\): +\begin{equation} + r_{ik} = 1 + \quad\Longleftrightarrow\quad + x_i \text{ belongs to cluster } k. +\end{equation} +Because every point must join one and only one cluster, we enforce +\begin{equation} + \sum_{k=1}^{K} r_{ik} = 1, + \qquad \forall i \in [n]. +\end{equation} + +The intuitive goal of K-means is simple: +each point should be close to the center of the cluster it joins. +This leads to the objective function +\begin{theorem} + \begin{equation} + L = + \sum_{i=1}^{n} + \sum_{k=1}^{K} + r_{ik} \, \|x_i - m_k\|^2, + \end{equation} +\end{theorem} +which measures the total within-cluster variance. + +At first glance, minimizing \(L\) over both the assignments \(r\) and the centers \(m\) seems challenging, +because the problem is non-convex and the discrete variables \(r_{ik}\) make the search space combinatorial. +However, K-means becomes surprisingly tractable once we notice a key structural property of its objective. +Although optimizing both the assignments \(r\) and the centers \(m\) jointly is difficult, +each subproblem becomes very simple when the other variable is held fixed: + +\begin{itemize} + \item \textbf{Fixing the centers \(\{m_k\}\):} + assigning each point \(x_i\) reduces to choosing the closest center. + + \item \textbf{Fixing the assignments \(\{r_{ik}\}\):} + updating each center becomes a simple averaging step, since the optimal \(m_k\) is just the mean of the points assigned to cluster \(k\). +\end{itemize} + +This observation directly motivates the alternating minimization strategy used by the K-means algorithm: +we first update the assignments, then update the centers, and repeat the process until convergence. +The objective value decreases at every iteration, and although the algorithm is not guaranteed to reach the global optimum, +it converges to a stable \textbf{local minimum} in practice. + +\textbf{Assignment step (fix the centers).} +For each datapoint \(x_i\), we assign it to the nearest cluster center: +\begin{equation} + r_{ik} = + \begin{cases} + 1, + & k = \displaystyle\argmin_{j\in[K]} \|x_i - m_j\|^2, \\[6pt] + 0, + & \text{otherwise}. + \end{cases} +\end{equation} + +\textbf{Update step (fix the assignments).} +To update \(m_k\), we minimize the partial objective +\begin{equation} + \sum_{i=1}^{n} r_{ik} \|x_i - m_k\|^2 +\end{equation} +with respect to \(m_k\). +Taking the derivative and setting it to zero, +\begin{equation} + \frac{\partial}{\partial m_k} + \sum_{i=1}^{n} r_{ik} \|x_i - m_k\|^2 + = + -2 \sum_{i=1}^{n} r_{ik} (x_i - m_k) + = 0, +\end{equation} +which gives the closed-form mean update: +\begin{equation} + m_k + = + \frac{\sum_{i=1}^{n} r_{ik} x_i} + {\sum_{i=1}^{n} r_{ik}}. 
+\end{equation} + +\begin{remark} + At each iteration, K-means performs a greedy improvement step: + the assignment update minimizes the objective given the centers, + and the center update minimizes the objective given the assignments. + Hence the objective value is non-increasing and bounded below, + which guarantees convergence. +\end{remark} + +\begin{remark} + However, precisely because each step is greedy and only optimizes a partial variable, + K-means may converge to a \emph{local} minimum rather than the global optimum. + Different initializations can therefore lead to different final solutions. +\end{remark} + +Together, these two steps define the classical K-means iteration, +which alternates between assigning points to their closest centers +and recomputing each center as the average of its assigned datapoints. +\begin{figure}[h] + \centering + \includegraphics[width=\textwidth]{../../tikz/9/3.pdf} +\end{figure} +\begin{note} +At each iteration, the K-means algorithm computes the \emph{centroids} of the current approximate clusters and then uses these centroids as the new cluster centers. + In other words, K-means repeatedly replaces each center with the center of mass of the points currently assigned to it. +\end{note} +\subsection{Mixture of Gaussians (MoG)} +K-means assumes that points belonging to each cluster are concentrated around a center, +but it does not model the \emph{shape} or \emph{spread} of each cluster. +A more expressive model is the \emph{Mixture of Gaussians}, +which assumes that points assigned to the same cluster are generated from a Gaussian distribution with its own mean and covariance. + +Formally, for cluster \(k \in [K]\), assume +$ + (x \mid z_k = 1) \sim \mathcal{N}(\mu_k,\, \Sigma_k). +$ +Here \(z = (z_1,\ldots,z_K)\) is a latent indicator vector with +\begin{equation} + z_k \in \{0,1\}, + \qquad + \sum_{k=1}^{K} z_k = 1. +\end{equation} +Thus \(z_k = 1\) indicates that \(x\) is generated from cluster \(k\). + +The prior distribution over latent clusters is modeled using a categorical distribution: +\begin{equation} + P(z_k = 1) = \pi_k, + \qquad + 0 \le \pi_k \le 1, + \qquad + \sum_{k=1}^{K} \pi_k = 1. +\end{equation} + +Given the latent variable \(z\), the conditional likelihood is +\begin{equation} + P(x \mid z_k = 1) + = + \mathcal{N}(x \mid \mu_k, \Sigma_k). +\end{equation} + +Since MoG is a \emph{generative model}, the marginal density of \(x\) is obtained by summing over all latent clusters: +\begin{equation} + P(x) + = + \sum_{k=1}^{K} + P(z_k = 1)\, P(x \mid z_k = 1) + = + \sum_{k=1}^{K} + \pi_k \, \mathcal{N}(x \mid \mu_k, \Sigma_k). +\end{equation} + +% In practice, the parameters +% \(\{\pi_k, \mu_k, \Sigma_k\}_{k=1}^{K}\) +% are estimated by maximizing the likelihood of the observed data, +% typically using the EM algorithm (Expectation–Maximization), +% which alternates between inferring the latent cluster probabilities and updating the Gaussian parameters. +\begin{figure}[h] + \centering + \includegraphics{../../tikz/9/4.pdf} +\end{figure} +\begin{note} + MoG can be viewed as a soft, probabilistic extension of K-means: +instead of forcing each point into exactly one cluster, +it allows fractional responsibilities and models not only the center but also the spread and orientation of each cluster. +\end{note} +\section{Latent Variable Models and the EM Algorithm} +\subsection{Latent-variable MLE and why it is hard} +MoG is a \emph{latent-variable} model: for each datapoint $x_i$ there is an +unobserved cluster indicator. 
We will use the lighter integer notation +\begin{equation} + G_i \in \{1,2,\dots,K\}, +\end{equation} +where $G_i=k$ means $x_i$ is generated from component $k$. +(This is equivalent to one-hot encoding used elsewhere.) + +The marginal likelihood for each datapoint is +\begin{equation} + p(x_i;\theta)=\sum_{k=1}^K \pi_k\,\mathcal N(x_i\mid \mu_k,\Sigma_k), +\end{equation} +thus the MLE objective becomes +\begin{equation}\label{eq:ul-mog-mle} + \argmax_{\theta}\; + \sum_{i=1}^{n} + \log\!\left( + \sum_{k=1}^{K} + \pi_k\, \mathcal{N}(x_i \mid \mu_k, \Sigma_k) + \right), +\end{equation} +where $\theta=\{\pi_k,\mu_k,\Sigma_k\}_{k=1}^K$. + +\begin{remark} + The difficulty is the \textbf{log of a sum}, which couples all components. + Moreover, $\sum_k\pi_k=1$ and each $\Sigma_k$ must be positive definite, + making naive unconstrained gradient descent non-trivial. +\end{remark} + +\subsection{KL divergence and ELBO (evidence lower bound)} +To derive EM in a general and reusable way, we introduce KL divergence. +\begin{definition}[Kullback--Leibler divergence] + For distributions $Q,P$ on the same variable $Z$, + \begin{equation} + \mathrm{KL}(Q\|P) + := \mathbb E_{Z\sim Q}\!\left[\log\frac{Q(Z)}{P(Z)}\right]. + \end{equation} +\end{definition} + +\begin{proposition}\label{prop:ul-kl} + $\mathrm{KL}(Q\|P)\ge 0$, and $\mathrm{KL}(Q\|P)=0$ iff $Q=P$ (a.s.). +\end{proposition} + +\begin{theorem}[ELBO decomposition]\label{thm:ul-elbo} + For any $q(z)$ and any $\theta$, + \begin{equation}\label{eq:ul-elbo} + \log p(x;\theta) + = + \underbrace{ + \mathbb E_{z\sim q}\!\left[\log \frac{p(x,z;\theta)}{q(z)}\right] + }_{\mathcal L(q,\theta)\;\;(\text{ELBO})} + + + \underbrace{ + \mathrm{KL}\!\left(q(z)\,\middle\|\,p(z\mid x;\theta)\right) + }_{\ge 0}. + \end{equation} + Hence $\mathcal L(q,\theta)\le \log p(x;\theta)$. +\end{theorem} + +\begin{remark} + ELBO is commonly read as ``elbow'' in talks. + The bound is tight when $q(z)=p(z\mid x;\theta)$. +\end{remark} + +\subsection{The EM algorithm (Expectation--Maximization)} +EM alternates between optimizing the variational distribution $q$ and the model +parameters $\theta$. + +\textbf{E-step (fix $\theta$).} +Given $\theta^{(t)}$, maximize ELBO in $q$: +\begin{equation} + q^{(t+1)}(z)=p(z\mid x;\theta^{(t)}), +\end{equation} +which drives the KL gap in \eqref{eq:ul-elbo} to $0$ at $\theta^{(t)}$. + +\textbf{M-step (fix $q$).} +With $q^{(t+1)}$ fixed, update parameters by maximizing ELBO: +\begin{equation} + \theta^{(t+1)}\in\argmax_{\theta}\; \mathcal L\!\left(q^{(t+1)},\theta\right). +\end{equation} +Because the $-\mathbb E_{q^{(t+1)}}[\log q^{(t+1)}]$ term does not depend on +$\theta$, this is equivalent to maximizing the expected complete-data +log-likelihood +\begin{equation} + Q(\theta\mid \theta^{(t)}) + := + \mathbb E_{z\sim p(z\mid x;\theta^{(t)})}\!\left[\log p(x,z;\theta)\right]. +\end{equation} + +\begin{remark} + Repeating E-step $\rightarrow$ M-step yields a monotone (non-decreasing) + data log-likelihood sequence, and EM converges to a stationary point (not + necessarily the global optimum). +\end{remark} + +\subsection{EM for MoG: responsibilities and closed-form updates} +For MoG, the complete-data likelihood for one datapoint is +\begin{equation} + p(x_i,G_i=k;\theta) + = \pi_k\,\mathcal N(x_i\mid \mu_k,\Sigma_k). 
+\end{equation} + +\textbf{E-step.} Compute posterior responsibilities +\begin{equation} + \gamma_{ik} + := + p(G_i=k\mid x_i;\theta^{(t)}) + = + \frac{\pi_k^{(t)}\, \mathcal{N}(x_i \mid \mu_k^{(t)}, \Sigma_k^{(t)})} + {\sum_{j=1}^{K} \pi_j^{(t)}\, \mathcal{N}(x_i \mid \mu_j^{(t)}, \Sigma_j^{(t)})}. +\end{equation} + +\textbf{M-step.} Let $N_k:=\sum_{i=1}^n\gamma_{ik}$. Then the maximizer has +closed-form updates: +\begin{gather} + \pi_k=\frac{N_k}{n},\\ + \mu_k=\frac{1}{N_k}\sum_{i=1}^n \gamma_{ik}x_i,\\ + \Sigma_k=\frac{1}{N_k}\sum_{i=1}^n \gamma_{ik}(x_i-\mu_k)(x_i-\mu_k)^{\top}. +\end{gather} + +\begin{remark} + These are \textbf{soft-assignment weighted} empirical mean and covariance. + When responsibilities collapse to hard assignments, MoG-EM reduces to the + familiar K-means updates. +\end{remark} +\end{document} \ No newline at end of file diff --git a/notes/2025/mvp/main.pdf b/notes/2025/mvp/main.pdf index 8cf0e9d..6e25b79 100644 Binary files a/notes/2025/mvp/main.pdf and b/notes/2025/mvp/main.pdf differ diff --git a/notes/2025/mvp/main.tex b/notes/2025/mvp/main.tex index ee48f36..b2e7a73 100644 --- a/notes/2025/mvp/main.tex +++ b/notes/2025/mvp/main.tex @@ -1,5 +1,8 @@ \documentclass[lang=en,newtx,10pt]{elegantbook} \usepackage{subfiles} +\usepackage{wrapfig} +\usepackage{amssymb} +\usepackage{pifont} \title{Machine Learning} \author{Shaoheng Yan (\href{https://www.photonyan.fun/about}{PhotonYan})} @@ -63,4 +66,7 @@ \subfile{chapters/5-rt.tex} \subfile{chapters/6-lt.tex} \subfile{chapters/7-gp.tex} +\subfile{chapters/8-tel.tex} +\subfile{chapters/9-ul.tex} +% \subfile{chapters/9-em-mog.tex} \end{document} \ No newline at end of file diff --git a/notes/2025/mvp/tikz/9/5.pdf b/notes/2025/mvp/tikz/9/5.pdf new file mode 100644 index 0000000..ddd593d Binary files /dev/null and b/notes/2025/mvp/tikz/9/5.pdf differ diff --git a/notes/2025/mvp/tikz/9/5.tex b/notes/2025/mvp/tikz/9/5.tex new file mode 100644 index 0000000..7f7dd47 --- /dev/null +++ b/notes/2025/mvp/tikz/9/5.tex @@ -0,0 +1,48 @@ +\documentclass[tikz,border=5pt]{standalone} +\usepackage{amsmath} +\usetikzlibrary{arrows.meta} + +\begin{document} +\begin{tikzpicture}[>=Latex,scale=1] + +% axes +\draw[->] (0,0) -- (6.5,0) node[below] {$\theta$}; +\draw[->] (0,0) -- (0,4.3) node[left] {$\log p(x;\theta)$}; + +% true log-likelihood curve (white curve in lecture) +\draw[thick] plot[smooth] coordinates { + (0.4,1.1) (1.2,1.6) (2.2,2.1) (3.4,2.9) (4.6,3.1) (5.8,3.35) +}; +\node[black] at (5.3,3.65) {\small data log-likelihood}; + +% theta_old and theta_new markers +\coordinate (told) at (2.2,2.1); +\coordinate (tnew) at (4.6,3.1); +\fill (told) circle (2pt); +\fill (tnew) circle (2pt); +\node[below] at (told) {\small $\theta^{(t)}$}; +\node[below] at (tnew) {\small $\theta^{(t+1)}$}; + +% ELBO curve touching at theta^{(t)} (red curve) +\draw[thick,red] plot[smooth] coordinates { + (0.4,0.7) (1.2,1.3) (2.2,2.1) (3.4,2.55) (4.6,3.0) (5.8,3.05) +}; +\node[red] at (1.1,0.9) {\small ELBO at $\theta^{(t)}$}; + +% ELBO curve touching at theta^{(t+1)} (orange curve), indicative of next iteration +\draw[thick,orange!80!black] plot[smooth] coordinates { + (0.4,0.6) (1.2,1.0) (2.2,1.7) (3.4,2.5) (4.6,3.1) (5.8,3.25) +}; +\node[orange!80!black] at (5.0,2.55) {\small ELBO at $\theta^{(t+1)}$}; + +% arrows for E-step and M-step +\draw[->,thick] (2.2,0.2) -- (2.2,1.95); +\node[right] at (2.25,1.1) {\small E-step: tighten}; + +\draw[->,thick] (2.35,2.05) -- (4.45,3.02); +\node[above] at (3.6,2.8) {\small M-step: maximize}; + +\end{tikzpicture} 
+\end{document} + + diff --git a/notes/2025/tikz/8/1.pdf b/notes/2025/tikz/8/1.pdf new file mode 100644 index 0000000..25de9bf Binary files /dev/null and b/notes/2025/tikz/8/1.pdf differ diff --git a/notes/2025/tikz/8/1.tex b/notes/2025/tikz/8/1.tex new file mode 100644 index 0000000..1a75ed3 --- /dev/null +++ b/notes/2025/tikz/8/1.tex @@ -0,0 +1,21 @@ +\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[>=stealth, node distance=1.8cm] + % root node + \node[circle,draw,minimum width=13mm,label=right:{$w^\top x + b \ge 0$}] (root) {}; + + % leaves + \node[draw,rectangle,below left=1.4cm and 2.0cm of root] (plus) {$+1$}; + \node[draw,rectangle,below right=1.4cm and 2.0cm of root] (minus) {$-1$}; + + % edges + \draw[->] (root) -- node[above left] {yes} (plus); + \draw[->] (root) -- node[above right] {no} (minus); +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/2.bbl b/notes/2025/tikz/8/2.bbl new file mode 100644 index 0000000..e69de29 diff --git a/notes/2025/tikz/8/2.blg b/notes/2025/tikz/8/2.blg new file mode 100644 index 0000000..41db3d9 --- /dev/null +++ b/notes/2025/tikz/8/2.blg @@ -0,0 +1,3 @@ +[0] Config.pm:308> INFO - This is Biber 2.20 +[0] Config.pm:311> INFO - Logfile is '2.blg' +[42] biber:340> INFO - === Wed Nov 19, 2025, 15:59:11 diff --git a/notes/2025/tikz/8/2.pdf b/notes/2025/tikz/8/2.pdf new file mode 100644 index 0000000..d8230ce Binary files /dev/null and b/notes/2025/tikz/8/2.pdf differ diff --git a/notes/2025/tikz/8/2.tex b/notes/2025/tikz/8/2.tex new file mode 100644 index 0000000..d5603b8 --- /dev/null +++ b/notes/2025/tikz/8/2.tex @@ -0,0 +1,49 @@ +\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[>=stealth, node distance=15mm] + + % root internal node (feature A) + \node[circle,draw,minimum size=11mm, + label=above:{\small $A$}] (A) {}; + + % right leaf: predict -1, set {4,5,6,7,9} + \node[rectangle,draw,below right=16mm and 24mm of A, + minimum width=12mm,minimum height=9mm] (Rminus) {$-1$}; + + % left internal node (feature B) + \node[circle,draw,below left=16mm and 10mm of A, + minimum size=11mm, + label=left:{\small $B$}] (B) {}; + + % left leaf under B: +1, set {1,2} + \node[rectangle,draw,below left=16mm and 8mm of B, + minimum width=12mm,minimum height=9mm] (Lplus) {$+1$}; + + % right leaf under B: -1, set {3,8} + \node[rectangle,draw,below right=16mm and 8mm of B, + minimum width=12mm,minimum height=9mm] (Lminus) {$-1$}; + + % edges from root + \draw[->] (A) -- node[above left,pos=0.45] {\scriptsize yes} + node[left,pos=0.55] {\scriptsize $\{1,2,3,8\}$} + (B); + \draw[->] (A) -- node[above right,pos=0.45] {\scriptsize no} + node[right,pos=0.55] {\scriptsize $\{4,5,6,7,9\}$} + (Rminus); + + % edges from B + \draw[->] (B) -- node[above left,pos=0.45] {\scriptsize yes} + node[left,pos=0.65] {\scriptsize $\{1,2\}$} + (Lplus); + \draw[->] (B) -- node[above right,pos=0.45] {\scriptsize no} + node[right,pos=0.65] {\scriptsize $\{3,8\}$} + (Lminus); + +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/3.pdf b/notes/2025/tikz/8/3.pdf new file mode 100644 index 0000000..da92626 Binary files /dev/null and b/notes/2025/tikz/8/3.pdf differ diff --git a/notes/2025/tikz/8/3.tex b/notes/2025/tikz/8/3.tex new file mode 100644 index 0000000..b20f5f4 --- /dev/null +++ b/notes/2025/tikz/8/3.tex @@ -0,0 +1,36 @@ 
+\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{calc} + +\begin{document} + +\begin{tikzpicture}[>=stealth,line width=0.5pt] + \fill[red!15] (2,2) rectangle (4,4); + % Axes + \draw[->] (-0.2,0) -- (4.2,0) node[right] {$B$}; + \draw[->] (0,-0.2) -- (0,4.2) node[above] {$A$}; + + % Horizontal and vertical split lines + \draw (0,2) -- (4,2); + \draw (2,2) -- (2,4); + + % Highlight the top-right cell (in light pink) + + + % Region labels + \node at (1,3) {$-1$}; + \node at (3,3) {$+1$}; + \node at (2,1) {$-1$}; + % \node at (3,1) {$-1$}; + + % Axis tick labels (+1 / -1) + \node[left] at (0,3) {$+1$}; + \node[left] at (0,1) {$-1$}; + + \node[below] at (1,0) {$-1$}; + \node[below] at (3,0) {$+1$}; + +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/4.pdf b/notes/2025/tikz/8/4.pdf new file mode 100644 index 0000000..bcbe9a7 Binary files /dev/null and b/notes/2025/tikz/8/4.pdf differ diff --git a/notes/2025/tikz/8/4.tex b/notes/2025/tikz/8/4.tex new file mode 100644 index 0000000..1dd715c --- /dev/null +++ b/notes/2025/tikz/8/4.tex @@ -0,0 +1,40 @@ +\documentclass[tikz]{standalone} +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[% + >=stealth, + level distance=18mm, + sibling distance=26mm, + every node/.style={font=\small} +] + +% root = 1.0 +\node[circle, draw, minimum size=9mm] (root) {$1.0$} + child[left] { + % apple leaf + node[align=center] (apple) {apple\\$0.2$} + edge from parent node[midway, left] {0} + } + child[right] { + % internal node for orange + banana + node[circle, draw, minimum size=9mm] (node2) {$0.8$} + edge from parent node[midway, right] {1} + }; + +% children of node2 +\node[align=center, below=12.7mm of node2, xshift=8mm] (orange) + {orange\\$0.3$}; + +\node[align=center, below=12mm of node2, xshift=-8mm] (banana) + {banana\\$0.5$}; + +% edges from node2 to leaves manually +\draw[->] (node2) -- (orange) node[midway,right ] {0}; +\draw[->] (node2) -- (banana) node[midway, left] {1}; + +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/5.pdf b/notes/2025/tikz/8/5.pdf new file mode 100644 index 0000000..abc5430 Binary files /dev/null and b/notes/2025/tikz/8/5.pdf differ diff --git a/notes/2025/tikz/8/5.tex b/notes/2025/tikz/8/5.tex new file mode 100644 index 0000000..177243f --- /dev/null +++ b/notes/2025/tikz/8/5.tex @@ -0,0 +1,36 @@ +\documentclass[tikz]{standalone} +\usepackage{tikz} + +\begin{document} + +\begin{tikzpicture}[every node/.style={font=\small}] + % 圆半径 + \def\r{1.8} + + % 圆心位置 + \coordinate (Xc) at (0,0); + \coordinate (Yc) at (3,0); + + % 着色:左 H(X),右 H(Y),交叠区域颜色更深 + \fill[blue!50,opacity=0.5] (Xc) circle (\r); + \fill[red!50,opacity=0.5] (Yc) circle (\r); + + % 边界 + \draw (Xc) circle (\r); + \draw (Yc) circle (\r); + + % 顶部标 H(X), H(Y) + \node[above] at (0, \r+0.2) {$H(X)$}; + \node[above] at (3, \r+0.2) {$H(Y)$}; + + % 条件熵部分:左右各一块 + \node at (-0.6,0) {$H(X\mid Y)$}; + \node at (3.6,0) {$H(Y\mid X)$}; + + % 中间交叠部分:互信息 + \draw (1.5,-0.3)--(1.5,-1.5); + \node at (1.5,-2) {$I(X;Y)$}; + +\end{tikzpicture} + +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/8/6.pdf b/notes/2025/tikz/8/6.pdf new file mode 100644 index 0000000..66d8fed Binary files /dev/null and b/notes/2025/tikz/8/6.pdf differ diff --git a/notes/2025/tikz/8/6.tex b/notes/2025/tikz/8/6.tex new file mode 100644 index 0000000..3e397c7 --- /dev/null +++ b/notes/2025/tikz/8/6.tex @@ -0,0 +1,51 @@ 
+\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[ + >=stealth, + every node/.style={font=\small}, + box/.style={draw,rounded corners,minimum width=18mm,minimum height=7mm,align=center}, + tree/.style={draw,rectangle,minimum width=10mm,minimum height=7mm,align=center} +] + +% Original dataset +\node[box] (data) {Training\\data $D$}; + +% Bootstrap samples +\node[box,below left=10mm and 6mm of data] (b1) {$D^{(1)}$}; +\node[box,below =10mm of data] (b2) {$D^{(2)}$}; +\node[box,below right=10mm and 6mm of data] (b3) {$D^{(M)}$}; + +% Arrows from original data to bootstraps +\draw[->] (data.south west) -- (b1.north); +\draw[->] (data.south) -- (b2.north); +\draw[->] (data.south east) -- (b3.north); + +% Base learners (trees) +\node[tree,below=10mm of b1] (t1) {$f_1$}; +\node[tree,below=10mm of b2] (t2) {$f_2$}; +\node[tree,below=10mm of b3] (t3) {$f_M$}; + +\draw[->] (b1) -- (t1); +\draw[->] (b2) -- (t2); +\draw[->] (b3) -- (t3); + +% Aggregation box +\node[box,below=16mm of t2,minimum width=26mm] (agg) {Average /\\majority vote}; + +\draw[->] (t1.south) -- ([xshift=-8mm]agg.north); +\draw[->] (t2.south) -- (agg.north); +\draw[->] (t3.south) -- ([xshift=8mm]agg.north); + +% Output +\node[right=20mm of agg] (out) {$F_{\mathrm{bag}}(x)$}; +\draw[->] (agg.east) -- (out.west); + +\end{tikzpicture} + +\end{document} + diff --git a/notes/2025/tikz/8/7.pdf b/notes/2025/tikz/8/7.pdf new file mode 100644 index 0000000..1027b33 Binary files /dev/null and b/notes/2025/tikz/8/7.pdf differ diff --git a/notes/2025/tikz/8/7.tex b/notes/2025/tikz/8/7.tex new file mode 100644 index 0000000..09ea585 --- /dev/null +++ b/notes/2025/tikz/8/7.tex @@ -0,0 +1,39 @@ +\documentclass[tikz]{standalone} + +\usepackage{tikz} +\usetikzlibrary{positioning} + +\begin{document} + +\begin{tikzpicture}[ + >=stealth, + every node/.style={font=\small}, + box/.style={draw,rounded corners,minimum width=22mm,minimum height=7mm,align=center}, + tree/.style={draw,rectangle,minimum width=10mm,minimum height=7mm,align=center} +] + +% Data with weights +\node[box] (d0) {Data $D$\\weights $w^{(0)}$}; + +% First tree +\node[tree,right=15mm of d0] (f1) {$f_1$}; +\draw[->] (d0) -- node[above]{fit} (f1); + +% Reweighted data +\node[box,below=10mm of f1,minimum width=26mm] (d1) {Data $D$\\weights $w^{(1)}$}; +\draw[->] (f1.south) -- node[right]{update} (d1.north); + +% Second tree +\node[tree,right=15mm of d1] (f2) {$f_2$}; +\draw[->] (d1) -- node[below]{fit} (f2); + +% Ensemble arrows +\node[box,above=10mm of f2,minimum width=30mm] (F) {$F_2(x)=\alpha_1 f_1(x)+\alpha_2 f_2(x)$}; + +\draw[->,dashed] (f1.north east) -- (F.west); +\draw[->,dashed] (f2.north) -- (F.south); + +\end{tikzpicture} + +\end{document} + diff --git a/notes/2025/tikz/9/1.pdf b/notes/2025/tikz/9/1.pdf new file mode 100644 index 0000000..39ee108 Binary files /dev/null and b/notes/2025/tikz/9/1.pdf differ diff --git a/notes/2025/tikz/9/1.tex b/notes/2025/tikz/9/1.tex new file mode 100644 index 0000000..a89165b --- /dev/null +++ b/notes/2025/tikz/9/1.tex @@ -0,0 +1,35 @@ +\documentclass[tikz,border=5pt]{standalone} + +\begin{document} +\begin{tikzpicture}[scale=0.6,>=stealth] + + % Axes + \draw[thick,->] (-3,0) -- (3.5,0) node[below] {$x_1$}; + \draw[thick,->] (0,-3) -- (0,3.5) node[left] {$x_2$}; + + % Scatter points (rough diagonal cluster) + \foreach \x/\y in { + % upper-right cloud + 0.2/0.6, 0.4/0.9, 0.5/1.1, 0.7/1.2, 0.8/1.5, + 1.0/1.6, 1.1/1.8, 1.3/2.0, 1.4/2.2, 1.6/2.3, + 1.8/2.5, 2.0/2.6, 
2.1/2.8, 2.2/3.0, + 0.1/0.9, 0.3/1.0, 0.6/1.3, 0.9/1.7, 1.2/1.9, + % central band + -0.2/0.2, 0.0/0.4, 0.2/0.5, 0.3/0.7, 0.5/0.8, + 0.7/1.0, 0.9/1.2, 1.0/1.4, + % lower-left small cloud + -2.0/-0.7, -1.8/-0.6, -1.7/-0.4, -1.5/-0.5, -1.3/-0.3, + -1.2/-0.2, -1.0/-0.1, -0.9/0.0, -0.8/0.1 + }{ + \fill (\x,\y) circle[radius=1.6pt]; + } + + % PCA axes + % PC1: along the main diagonal direction (roughly slope ~1) + \draw[ thick,blue,->] (-3,-1.8) -- (3,3.2) node[anchor=south west] {$z_1$}; + + % PC2: perpendicular to PC1 + \draw[ thick,red,<-] (-2.5,3) -- (2.5,-3) node[anchor=north west] {$z_2$}; + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/9/2.pdf b/notes/2025/tikz/9/2.pdf new file mode 100644 index 0000000..935a0ed Binary files /dev/null and b/notes/2025/tikz/9/2.pdf differ diff --git a/notes/2025/tikz/9/2.tex b/notes/2025/tikz/9/2.tex new file mode 100644 index 0000000..af0bc6e --- /dev/null +++ b/notes/2025/tikz/9/2.tex @@ -0,0 +1,45 @@ +\documentclass[tikz,border=5pt]{standalone} + +\usepackage{xcolor} + +\begin{document} +\begin{tikzpicture}[>=stealth,scale=0.6] + + % Axes in embedding space + \draw[->,] (-3,0) -- (3.5,0) node[right] {$y_1$}; + \draw[->,] (0,-3) -- (0,3.5) node[above] {$y_2$}; +% \node[above right] at (2.2,3.2) {\small t-SNE embedding space}; + + % Cluster 1 (blue) + \foreach \x/\y in { + -1.4/1.2, -1.6/1.4, -1.2/1.5, -1.5/1.1, -1.3/1.3, + -1.7/1.2, -1.4/1.0, -1.1/1.4 + }{ + \fill[blue!70] (\x,\y) circle (1.7pt); + } + \node[blue!70] at (-1.6,1.9) {\small $\mu_A$}; + + % Cluster 2 (red) + \foreach \x/\y in { + 1.4/1.0, 1.6/1.2, 1.8/0.9, 1.5/0.7, 1.9/1.1, + 1.3/0.8, 1.7/0.6, 1.5/1.3 + }{ + \fill[red!70] (\x,\y) circle (1.7pt); + } + \node[red!70] at (2.2,1.6) {\small $\mu_B$}; + + % Cluster 3 (green) + \foreach \x/\y in { + -0.4/-1.2, -0.2/-1.0, 0.0/-1.3, 0.2/-1.1, + -0.1/-1.5, 0.3/-1.4, -0.3/-1.0, 0.1/-1.6 + }{ + \fill[green!70!black] (\x,\y) circle (1.7pt); + } + \node[green!70!black] at (0.4,-0.6) {\small $\mu_C$}; + + % A few “outliers” + \fill[gray!70] (2.4,-1.5) circle (1.7pt); + \fill[gray!70] (-2.3,-1.8) circle (1.7pt); + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/9/3.pdf b/notes/2025/tikz/9/3.pdf new file mode 100644 index 0000000..1f1ecef Binary files /dev/null and b/notes/2025/tikz/9/3.pdf differ diff --git a/notes/2025/tikz/9/3.tex b/notes/2025/tikz/9/3.tex new file mode 100644 index 0000000..5db6054 --- /dev/null +++ b/notes/2025/tikz/9/3.tex @@ -0,0 +1,88 @@ +\documentclass[tikz,border=5pt]{standalone} +\usepackage{xcolor} + +\begin{document} +\begin{tikzpicture}[scale=1] + +% ---------- common style ---------- +\tikzstyle{panel}=[draw=black!40,rounded corners,thick] +\tikzstyle{center}=[circle,draw=black,fill=white,inner sep=1pt] + +% ================== Panel 1: initial assignment ================== +\begin{scope}[shift={(0,0)}] + % clip region + \clip (-2.4,-3) rectangle (2.4,3); + + % data blobs (new color scheme) + \fill[magenta!20] (-0.4,1.3) circle (0.9); % magenta blob + \fill[gray!20] (-0.8,-1.2) ellipse (0.8 and 0.45); % gray blob + \fill[cyan!20] (1.1,-0.1) ellipse (1.5 and 1.0); % cyan blob + + % initial (bad) centers + \node[center] (c1a) at (-1.5, 2.0) {}; + \node[center] (c2a) at ( 1.8, 1.8) {}; + \node[center] (c3a) at ( 0.5, -2.0) {}; + + % dashed circular decision regions + \draw[magenta!60,dashed,thick] (c1a) circle (1.4); + \draw[cyan!60,dashed,thick] (c2a) circle (1.6); + \draw[gray!60,dashed,thick] (c3a) circle (1.4); + + % frame + title (also 
inside clip) + \draw[panel] (-2.4,-3) rectangle (2.4,3); + \node at (0,2.7) {\small Iteration 0}; +\end{scope} + +% ================== Panel 2: updated centers ================== +\begin{scope}[shift={(6,0)}] + \clip (-2.4,-3) rectangle (2.4,3); + + % same data blobs + \fill[magenta!20] (-0.4,1.3) circle (0.9); + \fill[gray!20] (-0.8,-1.2) ellipse (0.8 and 0.45); + \fill[cyan!20] (1.1,-0.1) ellipse (1.5 and 1.0); + + % means after one update + \node[center] (c1b) at (-0.8, 1.7) {}; + \node[center] (c2b) at ( 1.5,0.3) {}; + \node[center] (c3b) at (-0.3,-1.4) {}; + + % dashed regions + \draw[magenta!60,dashed,thick] (c1b) circle (1.0); + \draw[cyan!60,dashed,thick] (c2b) circle (1.5); + \draw[gray!60,dashed,thick] (c3b) circle (0.9); + + \draw[panel] (-2.4,-3) rectangle (2.4,3); + \node at (0,2.7) {\small Iteration 1}; +\end{scope} + +% ================== Panel 3: converged ================== +\begin{scope}[shift={(12,0)}] + \clip (-2.4,-3) rectangle (2.4,3); + + % same data blobs + \fill[magenta!20] (-0.4,1.3) circle (0.9); + \fill[gray!20] (-0.8,-1.2) ellipse (0.8 and 0.45); + \fill[cyan!20] (1.1,-0.1) ellipse (1.5 and 1.0); + + % final means with labels + \node[center] + (c1c) at (-0.4, 1.3) {}; + + \node[center] + (c2c) at ( 1.1,-0.1) {}; + + \node[center] + (c3c) at (-0.8,-1.2) {}; + + % final boundaries + \draw[magenta!60,dashed,thick] (c1c) circle (1.0); + \draw[cyan!60,dashed,thick] (c2c) circle (1.5); + \draw[gray!60,dashed,thick] (c3c) circle (0.9); + + \draw[panel] (-2.4,-3) rectangle (2.4,3); + \node at (0,2.7) {\small Iteration 2}; +\end{scope} + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/notes/2025/tikz/9/4.pdf b/notes/2025/tikz/9/4.pdf new file mode 100644 index 0000000..adc8256 Binary files /dev/null and b/notes/2025/tikz/9/4.pdf differ diff --git a/notes/2025/tikz/9/4.tex b/notes/2025/tikz/9/4.tex new file mode 100644 index 0000000..2c44612 --- /dev/null +++ b/notes/2025/tikz/9/4.tex @@ -0,0 +1,34 @@ +\documentclass[tikz,border=5pt]{standalone} +\usepackage{amsmath} +\usetikzlibrary{positioning,fit,arrows.meta} + +\begin{document} +\begin{tikzpicture}[ + latent/.style={circle,draw,thick,minimum size=14pt,inner sep=0pt}, + observed/.style={latent,fill=gray!20}, + plate/.style={draw,thick,rounded corners,inner sep=8pt}, + >=Latex +] + +% nodes +\node[latent] (Z) {$Z$}; +\node[observed] (X) [below=1.6cm of Z] {$X$}; + +% arrow Z -> X +\draw[->,thick] (Z) -- (X); + +% labels on the right +\node[right=0.5cm of Z] {\small latent variable}; +\node[right=0.5cm of X] {\small observed data}; + +% prior / parameters +\draw[<-] (Z) --++ (-1,0); +\draw[->] (X) --++ (0,-1); +\node[left=0.9cm of Z] {$\pi$}; +\node[below=0.8cm of X] {\small $\mu_k,\;\Sigma_k$}; + +% plate surrounding the generative block +\node[plate,fit=(Z)(X)] (plate1) {}; + +\end{tikzpicture} +\end{document} \ No newline at end of file