diff --git a/blog/authors.json b/blog/authors.json index a3f5c7329f0..ae2fbcf851e 100644 --- a/blog/authors.json +++ b/blog/authors.json @@ -17,5 +17,16 @@ "socials": { "github": "kunkunlin1221" } + }, + "nbswords": { + "name": "nbswords", + "title": "Research Engineer", + "url": "https://github.com/nbswords", + "image_url": "https://github.com/nbswords.png", + "socials": { + "github": "nbswords", + "linkedin": "nbswords", + "x": "nbswordsYu" + } } } diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img1.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img1.jpg new file mode 100644 index 00000000000..b3275300e2c Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img1.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img10.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img10.jpg new file mode 100644 index 00000000000..578f316d23c Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img10.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img11.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img11.jpg new file mode 100644 index 00000000000..e59de2b97eb Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img11.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img12.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img12.jpg new file mode 100644 index 00000000000..fc19bc5bc15 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img12.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img13.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img13.jpg new file mode 100644 index 00000000000..cefad30cdfc Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img13.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img14.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img14.jpg new file mode 100644 index 00000000000..ee745038b69 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img14.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img15.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img15.jpg new file mode 100644 index 00000000000..e095988085a Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img15.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img16.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img16.jpg new file mode 100644 index 00000000000..4b2d10a1e5e Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img16.jpg differ diff --git 
a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img17.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img17.jpg new file mode 100644 index 00000000000..b0b29804523 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img17.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img18.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img18.jpg new file mode 100644 index 00000000000..1d5fa70f1b1 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img18.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img19.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img19.jpg new file mode 100644 index 00000000000..504121111b7 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img19.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img2.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img2.jpg new file mode 100644 index 00000000000..7d295a6baa0 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img2.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img3.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img3.jpg new file mode 100644 index 00000000000..13d02eccc31 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img3.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img4.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img4.jpg new file mode 100644 index 00000000000..461284a0a04 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img4.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img5.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img5.jpg new file mode 100644 index 00000000000..412bf3801e9 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img5.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img6.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img6.jpg new file mode 100644 index 00000000000..1faf46bddfb Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img6.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img7.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img7.jpg new file mode 100644 index 00000000000..c60b5f2bc54 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img7.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img8.jpg 
b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img8.jpg new file mode 100644 index 00000000000..693e2695661 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img8.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img9.jpg b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img9.jpg new file mode 100644 index 00000000000..3821a9fef43 Binary files /dev/null and b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img9.jpg differ diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/index.md b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/index.md new file mode 100644 index 00000000000..147d007e6e9 --- /dev/null +++ b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/index.md @@ -0,0 +1,256 @@ +--- +title: "[24.06] MAR" +authors: nbswords +--- + +## Reforging the Order of Generation + +[**Autoregressive Image Generation without Vector Quantization**](https://arxiv.org/abs/2406.11838) + +--- + +:::info +This article is simultaneously published on [**nbswords 的 Medium**](https://medium.com/@nbswords/autoregressive-image-generation-without-vector-quantization-516b68b5acfa) +::: + +Current autoregressive image generation models often use vector quantization (VQ) to discretize images into tokens, mimicking the success of autoregressive models in the NLP domain. However, the authors argue that such a discrete space is not necessary for autoregressive image generation. Therefore, they propose an autoregressive image generation model based on continuous space, which achieves higher accuracy and faster inference time. + +## Background Knowledge + +### Vector Quantization (VQ) + +This is a long-established technique to accelerate vector search. The method segments a feature space vector (embedding vector) into different groups, each represented by a centroid vector serving as an index. A codebook containing all centroid vector indices is then used to access these groups of vectors. + +
+
+![VQ](./img/img1.jpg) +
+
For details, please refer to [Survey Of Vector Space Search](https://medium.com/@nbswords/survey-of-vector-space-search-26555890ca5e) or the [Vector quantization wiki](https://en.wikipedia.org/wiki/Vector_quantization).

### Auto-regressive Image Generation

Early visual autoregressive models (VAR) treated image generation as GPT-like autoregressive text generation: each pixel is viewed as a category, and the model performs multi-class prediction with categorical cross-entropy. Examples include Google's [Image Transformer, 2018](https://arxiv.org/abs/1802.05751) and OpenAI's [ImageGPT, 2020](https://cdn.openai.com/papers/Generative_Pretraining_from_Pixels_V2.pdf).

To speed up generation, current autoregressive image generation models commonly introduce VQ and train in two stages: the first stage learns a codebook in the latent space for image reconstruction, and the second stage autoregressively generates images over the learned codebook.

- Take [VQ-VAE, 2017](https://arxiv.org/abs/1711.00937) as an example:

  - In the encoding stage, a CNN extracts image features, and vector quantization is applied to the feature map $z_e$: the distance between each feature point in $z_e$ and the codebook's centroid vectors (the purple vectors, $e_1 \sim e_K$) is computed, and each feature point is replaced by the index of its nearest centroid, producing the discrete representation $q(z|x)$.
  - In the decoding stage, the image is reconstructed from $z_q$, obtained by mapping the indices in $q(z|x)$ back to their codebook vectors (a minimal code sketch of this quantization step follows below).
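To make the quantization step concrete, here is a minimal sketch of nearest-centroid lookup. The `vector_quantize` helper, the shapes, and the codebook size are illustrative assumptions, not the paper's code:

```python
import torch

def vector_quantize(z_e: torch.Tensor, codebook: torch.Tensor):
    """Nearest-centroid quantization as in VQ-VAE.

    z_e:      (N, D) continuous features from the CNN encoder.
    codebook: (K, D) learned centroid vectors e_1..e_K.
    Returns discrete indices q and the quantized vectors z_q.
    """
    dist = torch.cdist(z_e, codebook)  # (N, K) pairwise L2 distances
    q = dist.argmin(dim=1)             # index of the nearest centroid per feature point
    z_q = codebook[q]                  # map indices back to centroid vectors
    return q, z_q

# Toy usage: a 16x16 feature map with 64-dim features and a 512-entry codebook
q, z_q = vector_quantize(torch.randn(256, 64), torch.randn(512, 64))
print(q.shape, z_q.shape)  # torch.Size([256]) torch.Size([256, 64])
```

The decoder then consumes `z_q`; since the argmin is non-differentiable, VQ-VAE trains through it with a straight-through estimator, which is one reason VQ encoders are hard to train.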
+
+![VQ-VAE](./img/img2.jpg) +
+
Using VQ to speed up autoregressive image generation sounds great, but are there no drawbacks?

There are two: 1. VQ encoders are hard to train; 2. quantization degrades the quality of the reconstructed images.

## Method

### Abandoning VQ, Embracing Diffusion

Since diffusion models can represent the joint probability distribution of all pixels or tokens in an image, why not use them to represent the conditional distribution of each individual token?

- Diffusion models can generate images from noise conditioned on an input prompt/image.
+
+![diffusion+text_prompt](./img/img3.jpg) +
+
+ +- The current approach conditions on the output of a transformer to generate images from noise (details of the transformer input will be explained later). + +
+
+![diffusion+transformer](./img/img4.jpg) +
+
+ +Their method autoregressively predicts the conditional latent variable $z$ for each token, then uses a diffusion model (a small MLP) to perform denoising and obtain the output $x$'s probability distribution $p(x|z)$. + +
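Conceptually, generation alternates between the transformer, which produces a condition vector $z$ for each position to be predicted, and the small diffusion MLP, which samples the token value from $p(x|z)$. A rough, runnable sketch with stand-in modules (the function names and toy shapes are assumptions of this illustration, not the official API):

```python
import torch

def generate(transformer, denoise_token, tokens, mask, schedule):
    """Sketch of MAR-style generation: at each step, the transformer encodes the
    current (partially masked) sequence, and a per-token diffusion sampler fills
    in the positions chosen by the unmasking schedule."""
    for count in schedule:                      # how many tokens to unmask this step
        z = transformer(tokens)                 # (N, D) condition vectors
        masked = torch.nonzero(mask).flatten()
        chosen = masked[torch.randperm(len(masked))[:count]]
        for i in chosen:
            tokens[i] = denoise_token(z[i])     # sample x ~ p(x|z) via reverse diffusion
            mask[i] = False
    return tokens

# Toy usage with stand-in modules
N, D = 16, 8
tokens = torch.zeros(N, D)
mask = torch.ones(N, dtype=torch.bool)
transformer = lambda t: t + 1.0                           # stand-in for the bidirectional ViT
denoise_token = lambda z: z + 0.1 * torch.randn_like(z)   # stand-in for the diffusion MLP
out = generate(transformer, denoise_token, tokens, mask, schedule=[4, 4, 4, 4])
```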
+
+![autoregressive_diffusion](./img/img5.jpg) +
+
+ +They propose a Diffusion Loss to replace the original categorical cross-entropy. + +
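In symbols, the figures below show that Diffusion Loss is the standard DDPM noise-prediction objective, conditioned on the token's vector $z$; reconstructed from the surrounding definitions it reads:

$$
\mathcal{L}(z, x) = \mathbb{E}_{\varepsilon, t}\left[\left\| \varepsilon - \varepsilon_\theta(x_t \mid t, z) \right\|^2\right]
$$

where $x_t = \sqrt{\bar{\alpha}_t}\, x + \sqrt{1 - \bar{\alpha}_t}\, \varepsilon$ is the noised token defined by the noise schedule.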
+
+![diffusion_cal](./img/img6.jpg) +
+
+
+
+![diffusion_cal_2](./img/img7.jpg) +
+
- $\varepsilon$ is a noise sample drawn from a standard normal distribution $\mathcal{N}(0, \mathbf{I})$.
- $\varepsilon_{\theta}$ is a small MLP; $\varepsilon_{\theta}(x_t | t, z)$ means the model takes the noised token $x_t$, the timestep $t$, and the condition $z$ as inputs and predicts the noise vector $\varepsilon$.

  - $t$ is the timestep in the noise schedule, and $z$ is the condition vector produced by the Transformer from the context.

Finally, as in DDPM, during inference the model generates images through the reverse diffusion process (from $x_t$ to $x_0$).
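Before the reverse-process figure below, here is a minimal sketch of what such a denoising MLP can look like, using the block structure described later under Implementation (LayerNorm, linear, SiLU, residual, with AdaLN-style conditioning). The module names, the token width, and the way the timestep embedding is folded into the condition are my assumptions, not the official code:

```python
import torch
import torch.nn as nn

class AdaLNBlock(nn.Module):
    """One residual block: LayerNorm -> Linear -> SiLU, with the norm modulated by z."""
    def __init__(self, dim: int = 1024):
        super().__init__()
        self.norm = nn.LayerNorm(dim, elementwise_affine=False)
        self.fc = nn.Linear(dim, dim)
        self.act = nn.SiLU()
        self.ada = nn.Linear(dim, 2 * dim)  # predicts per-channel scale and shift from z

    def forward(self, x: torch.Tensor, z: torch.Tensor) -> torch.Tensor:
        scale, shift = self.ada(z).chunk(2, dim=-1)
        h = self.norm(x) * (1 + scale) + shift  # AdaLN: condition modulates the normalization
        return x + self.act(self.fc(h))         # residual connection

class DenoisingMLP(nn.Module):
    """epsilon_theta(x_t | t, z): predicts the noise added to a single token."""
    def __init__(self, token_dim: int = 16, dim: int = 1024, depth: int = 3):
        super().__init__()
        self.proj_in = nn.Linear(token_dim, dim)
        self.blocks = nn.ModuleList(AdaLNBlock(dim) for _ in range(depth))
        self.proj_out = nn.Linear(dim, token_dim)

    def forward(self, x_t, t_emb, z):
        h = self.proj_in(x_t)
        cond = z + t_emb  # timestep embedding folded into the condition (an assumption)
        for blk in self.blocks:
            h = blk(h, cond)
        return self.proj_out(h)

# Toy usage: one 16-dim token, 1024-dim condition and timestep embedding
mlp = DenoisingMLP()
eps_hat = mlp(torch.randn(1, 16), torch.randn(1, 1024), torch.randn(1, 1024))
```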
+
+![reverse_diff](./img/img8.jpg) +
+
Additionally, readers familiar with diffusion models will recognize the sampling hyperparameter called temperature, which controls sampling randomness: higher temperature means more randomness, lower means less. In this work, temperature is applied by scaling the noise term of each reverse step, $\sigma_t \delta$, by a factor $\tau$ (i.e., $\tau \cdot \sigma_t \delta$), following the practice introduced in [Diffusion Models Beat GANs on Image Synthesis](https://arxiv.org/abs/2105.05233).

Reflecting on this approach reveals a clever combination of VAE's conditional generation and DDPM's denoising: compressing image features with a VAE and generating images with DDPM.
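As a sketch, one temperature-scaled reverse step for a token can be written as follows, using the standard DDPM posterior mean (the schedule values in the toy call are made up for illustration):

```python
import torch

def reverse_step(x_t, eps, alpha_t, alpha_bar_t, sigma_t, tau=1.0):
    """One DDPM reverse step for a token, with sampling temperature tau.

    eps is the noise predicted by the small MLP epsilon_theta(x_t | t, z).
    The mean follows the standard DDPM posterior; tau only rescales the
    injected noise sigma_t * delta, leaving the mean untouched.
    """
    beta_t = 1.0 - alpha_t
    mean = (x_t - beta_t / torch.sqrt(1.0 - alpha_bar_t) * eps) / torch.sqrt(alpha_t)
    delta = torch.randn_like(x_t)  # delta ~ N(0, I)
    return mean + tau * sigma_t * delta

# Toy call with made-up schedule values for one token of dimension 16
x_t = torch.randn(16)
eps = torch.randn(16)  # in practice: eps_model(x_t, t, z)
x_prev = reverse_step(x_t, eps, alpha_t=torch.tensor(0.99),
                      alpha_bar_t=torch.tensor(0.5), sigma_t=0.1)
```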
+
+![VAE_and_Diffusion](./img/img9.jpg) +
+
### Integrating Concepts from Autoregressive and Masked Generative Models

As the author of [MAE (Masked Autoencoders Are Scalable Vision Learners), 2021](https://arxiv.org/abs/2111.06377), Kaiming He naturally considered incorporating MAE's ideas into the model.

- The main pipeline follows two MAE follow-up works, [MaskGIT, 2022](https://arxiv.org/abs/2202.04200) and [MAGE, 2022](https://arxiv.org/abs/2211.09117), both of which use Transformers for masked autoregressive (MAR) modeling. MaskGIT's contribution is using a bidirectional transformer decoder to predict multiple tokens simultaneously, while MAGE unifies image representation learning and image synthesis.
- This work likewise adopts MAE's bidirectional attention, placing mask tokens \[m] in the middle layers so that all tokens can see each other rather than only preceding tokens, and computing the loss only on the masked tokens.

  - Note: this is not the conventional causal vs. bidirectional attention distinction. For a deeper understanding, refer to the original MAE paper.

  - The advantage is improved image generation quality; the disadvantage is that training and inference cannot leverage kv cache acceleration. However, since multiple tokens are predicted simultaneously, the method is still reasonably fast.
+
+![bidirect](./img/img10.jpg) +
+
- The figure below compares standard sequential AR, random-order AR (random masking of one token at a time), and their approach (random masking of multiple tokens with simultaneous prediction). This also reveals that the transformer input mentioned earlier is a masked image; see the sketch below for how the masking can be scheduled.
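Here is a minimal sketch of MAR-style random-order masking together with the cosine unmasking schedule used at inference (64 steps by default). The helper names and the exact rounding are assumptions of this illustration:

```python
import math
import torch

def random_masking_order(num_tokens: int) -> torch.Tensor:
    """A random permutation of token positions; training masks a suffix of it."""
    return torch.randperm(num_tokens)

def tokens_per_step(num_tokens: int, steps: int = 64):
    """Cosine schedule: the masking ratio falls from 1.0 to 0 over `steps` steps."""
    ratios = [math.cos(math.pi / 2 * s / steps) for s in range(steps + 1)]
    still_masked = [round(num_tokens * r) for r in ratios]
    # Number of tokens predicted (unmasked) at each step: few early, many late
    return [still_masked[s] - still_masked[s + 1] for s in range(steps)]

order = random_masking_order(256)   # e.g. 16x16 = 256 latent tokens
per_step = tokens_per_step(256)
assert sum(per_step) == 256         # every token gets predicted exactly once
```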
+
+![AR_and_MAR](./img/img11.jpg) +
+
## Implementation

- Diffusion Loss: uses a cosine-shaped noise schedule; training uses 1000 DDPM steps while inference uses only 100 steps.
- Denoising MLP (small MLP): consists of 3 blocks with 1024 channels each; each block contains LayerNorm, a linear layer, SiLU activation, and a residual connection. AdaLN is used to inject the transformer output $z$ into the LayerNorm layers.
- Tokenizer: uses publicly available tokenizers from LDM, including VQ-16 and KL-16. VQ-16 is based on VQ-GAN with GAN loss and perceptual loss; KL-16 uses KL-divergence regularization and does not rely on VQ.
- Transformer: a ViT receives the token sequence from the tokenizer, adds positional encoding and a class token [CLS], then passes it through 32 transformer blocks of 1024 channels each.
- Masked autoregressive models: training samples a masking ratio from [0.7, 1.0] (a ratio of 0.7 means 70% of the tokens are randomly masked). To avoid sequences becoming too short, 64 [CLS] tokens are always padded onto the sequence. During inference, the masking ratio is gradually reduced from 1.0 to 0 following a cosine schedule, over 64 steps by default.
- Baseline autoregressive model: a GPT-style model with causal attention, a [CLS] token appended to the input, kv cache, and a temperature parameter.

## Experiments

All experiments use AR/MAR-L (\~400M parameters), trained for 400 epochs on ImageNet at 256×256 resolution.

### Diffusion Loss vs. Cross-Entropy Loss

Among all AR/MAR variants, models trained with Diffusion Loss consistently outperform those trained with cross-entropy. The plain AR model sees the smallest improvement, with gains increasing down the table toward MAR + bidirectional attention + multiple predictions per step, demonstrating how important Diffusion Loss is for MAR models.

- Diffusion Loss also incorporates Classifier-Free Guidance (CFG), commonly used in diffusion models, to enhance generation quality.
- Fréchet Inception Distance (FID) is better when lower, while Inception Score (IS) is better when higher; both metrics assess generated image quality.
+
+![Diffusion Loss](./img/img12.jpg) +
+
### Tokenizers

The authors tested different tokenizers paired with Diffusion Loss. Because Diffusion Loss no longer requires a discrete space, the choice of tokenizer is unrestricted: it works with both continuous and discrete tokenizers.

- VQ-16 here refers to taking the continuous latent before vector quantization in VQ-VAE as tokens. Both VQ-16 and KL-16 come from LDM, but retrained on ImageNet rather than OpenImages.
- Consistency Decoder is a non-VQ tokenizer originating from [DALL·E 3](https://github.com/openai/consistencydecoder).
- Reconstruction FID (rFID), better when lower, is used to evaluate tokenizer quality.
+
+![Tokenizers](./img/img13.jpg) +
+
Results show that continuous tokenizers such as KL-16 outperform discrete ones such as VQ-16, and that the model is also compatible with other tokenizers such as the Consistency Decoder.

### Denoising MLP

Comparing denoising MLPs of different sizes shows that a width of 1024 yields the best results.
+
![Denoising MLP](./img/img14.jpg)
+
+ +### Sampling Steps of Diffusion Loss + +Different diffusion steps impact generation quality. Using 100 steps already achieves good performance. + +
+
+![Sampling steps of Diffusion Loss](./img/img15.jpg) +
+
+ +### Temperature of Diffusion Loss + +Temperature is also an important factor for Diffusion Loss. + +
+
+![Temperature](./img/img16.jpg) +
+
+ +### Speed/Accuracy Trade-off + +Since kv cache cannot be used, this evaluation is critical. Testing was done on an A100 GPU with batch size=256. + +
+
+![Speed](./img/img17.jpg) +
+
- MAR: each point represents a different number of autoregressive steps (8 to 128).
- DiT: each point represents a different number of diffusion steps (50, 75, 150, 250), using DiT-XL.
- AR: uses kv cache.

Despite lacking kv cache, the model maintains decent inference speed, although the default setting (64 steps) is noticeably slower.

### System-Level Comparison

In a comparison with other systems, only the largest model, MAR-H, achieves the best overall performance, but MAR-L already performs well.
+
+![System-level comparison](./img/img18.jpg) +
+
## Conclusion

In summary, this work breaks away from traditional autoregressive image generation methods, combining diffusion and MAE concepts to open a new direction. It demonstrates promising results using the simplest DDPM formulation, so better diffusion models should yield even stronger outcomes; more follow-up work can be expected.

Below are some example images generated by the model.
+
+![result](./img/img19.jpg) +
+
+ +## Reference + +[https://zhouyifan.net/2024/07/27/20240717-ar-wo-vq/](https://zhouyifan.net/2024/07/27/20240717-ar-wo-vq/) diff --git a/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/_category_.json b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/_category_.json new file mode 100644 index 00000000000..85bc3954950 --- /dev/null +++ b/i18n/en/docusaurus-plugin-content-docs-papers/current/image-generation/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Image Generation", + "link": { + "type": "generated-index" + } +} diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img1.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img1.jpg new file mode 100644 index 00000000000..b3275300e2c Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img1.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img10.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img10.jpg new file mode 100644 index 00000000000..578f316d23c Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img10.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img11.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img11.jpg new file mode 100644 index 00000000000..e59de2b97eb Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img11.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img12.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img12.jpg new file mode 100644 index 00000000000..fc19bc5bc15 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img12.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img13.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img13.jpg new file mode 100644 index 00000000000..cefad30cdfc Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img13.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img14.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img14.jpg new file mode 100644 index 00000000000..ee745038b69 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img14.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img15.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img15.jpg new file mode 100644 index 00000000000..e095988085a Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img15.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img16.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img16.jpg new file mode 100644 index 00000000000..4b2d10a1e5e Binary files /dev/null and 
b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img16.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img17.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img17.jpg new file mode 100644 index 00000000000..b0b29804523 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img17.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img18.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img18.jpg new file mode 100644 index 00000000000..1d5fa70f1b1 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img18.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img19.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img19.jpg new file mode 100644 index 00000000000..504121111b7 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img19.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img2.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img2.jpg new file mode 100644 index 00000000000..7d295a6baa0 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img2.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img3.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img3.jpg new file mode 100644 index 00000000000..13d02eccc31 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img3.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img4.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img4.jpg new file mode 100644 index 00000000000..461284a0a04 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img4.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img5.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img5.jpg new file mode 100644 index 00000000000..412bf3801e9 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img5.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img6.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img6.jpg new file mode 100644 index 00000000000..1faf46bddfb Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img6.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img7.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img7.jpg new file mode 100644 index 00000000000..c60b5f2bc54 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img7.jpg differ diff 
--git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img8.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img8.jpg new file mode 100644 index 00000000000..693e2695661 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img8.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img9.jpg b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img9.jpg new file mode 100644 index 00000000000..3821a9fef43 Binary files /dev/null and b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/img/img9.jpg differ diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/index.md b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/index.md new file mode 100644 index 00000000000..063c6b9b833 --- /dev/null +++ b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/2406-mar/index.md @@ -0,0 +1,253 @@ +--- +title: "[24.06] MAR" +authors: nbswords +--- + +## 再び生成の秩序を鍛え直す + +[**Autoregressive Image Generation without Vector Quantization**](https://arxiv.org/abs/2406.11838) + +--- + +:::info +本記事は [**nbswords 的 Medium**](https://medium.com/@nbswords/autoregressive-image-generation-without-vector-quantization-516b68b5acfa) に同時公開されています。 +::: + +現在の自己回帰型画像生成モデルは、NLP 分野の自己回帰モデルの成功を模倣するために、画像をベクトル量子化(VQ)して複数のトークンに離散化する手法が一般的ですが、著者はこの離散空間が自己回帰型画像生成に必須ではないと考え、連続空間に基づく自己回帰画像生成モデルを提案しました。これにより、より高い精度と高速な推論時間を両立しています。 + +## 背景知識 + +### ベクトル量子化(Vector Quantization, VQ) + +これは長年用いられているベクトル探索の高速化技術であり、特徴空間のベクトル(埋め込みベクトル)を複数のグループに分割し、各グループを代表するベクトルをインデックスとして扱い、すべての代表ベクトルのインデックスを含むコードブックを通じてベクトル群を管理します。 + +
+
+![VQ](./img/img1.jpg) +
+
+ +詳しい方法は [Survey Of Vector Space Search](https://medium.com/@nbswords/survey-of-vector-space-search-26555890ca5e) や [Vector quantization wiki](https://en.wikipedia.org/wiki/Vector_quantization) を参照してください。 + +### 自己回帰型画像生成(Auto-regressive image generation) + +初期の Visual Autoregressive Model (VAR)は、画像生成タスクを GPT のような自己回帰型テキスト生成として直接模擬し、画像の各ピクセルを一つのカテゴリとして扱い、モデルは categorical cross entropy で多クラス予測を行います。Google の[Image Transformer, 2018](https://arxiv.org/abs/1802.05751)や OpenAI の[ImageGPT, 2020](https://cdn.openai.com/papers/Generative_Pretraining_from_Pixels_V2.pdf)がこの方法に該当します。 + +画像生成速度を上げるため、現在の自己回帰画像生成モデルでは VQ を導入した 2 段階学習が一般的です。第 1 段階で latent space 上で画像復元に用いる codebook を学習し、第 2 段階で学習した codebook に基づいて自己回帰的に画像を生成します。 + +- 例として[VQ-VAE, 2017](https://arxiv.org/abs/1711.00937) + + - Encode 段階では CNN で画像特徴を抽出し、feature map $z_e$ に対して VQ を行い、各グループの代表ベクトル(紫色のベクトル、$e_1$ ~ $e_K$)を取得します。次に $z_e$ の各特徴点と代表ベクトル間の距離を計算し、最も近い代表ベクトルのインデックスで置換し $q(z|x)$ を得ます。これが離散特徴です。 + - Decode 段階では $q$ からマッピングされた $Z_q$ を用いて画像を生成します。 + +
+
+![VQ-VAE](./img/img2.jpg) +
+
+ +自己回帰モデルで VQ を使い画像生成を高速化するのは魅力的ですが、欠点はないのでしょうか? + +もちろんあります。1. VQ エンコーダの訓練が難しい 2. VQ によって復元画像の品質が低下する点です。 + +## 方法 + +### VQ を捨てて、Diffusion を採用 + +Diffusion モデルは画像の全ピクセルやトークンの結合確率分布を表現できるなら、なぜ各トークンの条件付き確率分布を表現できないのか? + +- Diffusion は入力のプロンプトや画像を条件にノイズから画像を生成する + +
+
+![diffusion+text_prompt](./img/img3.jpg) +
+
+- 今回はTransformerの出力を条件としてノイズから生成(Transformerの入力については後述) +
+
+![diffusion+transformer](./img/img4.jpg) +
+
+ +彼らの手法は、各トークンの条件付き潜在変数 z を自己回帰的に予測し、小さな MLP の Diffusion モデルでノイズ除去(denoising)を行い、出力 x の確率分布 p(x|z) を得る + +
+
+![autoregressive_diffusion](./img/img5.jpg) +
+
+ +元のカテゴリカルクロスエントロピーに代えて Diffusion Loss を提案 + +
+
+![diffusion_cal](./img/img6.jpg) +
+
+
+
+![diffusion_cal_2](./img/img7.jpg) +
+
- $\varepsilon$ は標準正規分布 $\mathcal{N}(0, \mathbf{I})$ からサンプリングされたノイズ
- $\varepsilon_\theta$ は小さな MLP で、$\varepsilon_\theta(x_t | t, z)$ はタイムステップ $t$ と条件 $z$ をもとに、入力 $x_t$ からノイズベクトル $\varepsilon$ を予測するモデル

  - $t$ はノイズスケジュールのタイムステップ、$z$ は Transformer が文脈から生成した条件ベクトル

最後に、DDPM 同様に推論時には逆拡散過程(reverse diffusion process)で画像を生成($x_t$ から $x_0$ へ)
+
+![reverse_diff](./img/img8.jpg) +
+
また、Diffusion モデルの画像生成では「温度(temperature)」というハイパーパラメータがあり、温度が高いほど生成のランダム性が大きく、低いほど小さい。本モデルでは、逆拡散の各ステップで注入するノイズ項 $\sigma_t \delta$ を温度 $\tau$ でスケーリングする(つまり $\tau \cdot \sigma_t \delta$)ことで調整する。この手法は [Diffusion Models Beat GANs on Image Synthesis](https://arxiv.org/abs/2105.05233) に由来する。

これは VAE の条件生成と DDPM のノイズ除去を巧みに組み合わせ、VAE で画像特徴を圧縮し DDPM で画像を生成するという非常に洗練された手法である。
+
+![VAE_and_Diffusion](./img/img9.jpg) +
+
### Autoregressive と Masked Generative Models の概念を融合

本モデルは [MAE (Masked Autoencoders Are Scalable Vision Learners), 2021](https://arxiv.org/abs/2111.06377) の著者である Kaiming He 氏の発想を踏襲し、MAE の考えを取り入れている。

- 主に MAE の後続研究 [MaskGIT, 2022](https://arxiv.org/abs/2202.04200) と [MAGE, 2022](https://arxiv.org/abs/2211.09117) を参考にしており、いずれも Transformer を用いた Masked Autoregressive(MAR)である。MaskGIT は bidirectional transformer decoder で複数トークンを同時予測する点が貢献、MAGE は画像表現学習と画像合成の統合が貢献。
- 本作でも MAE の双方向注意機構を用い、マスクトークン [m] を中間層に置いて全トークンが互いを参照できるようにし、マスクされたトークンのみで損失を計算する。

  - これは一般的に言われる causal attention と bidirectional attention の対比とは異なるため、深く理解したい場合は MAE の原論文を参照。
  - この設計により画像生成性能が向上する反面、Transformer の訓練・推論で kv cache を使った高速化ができなくなる。ただし複数トークンを同時予測するため速度低下は限定的。
+
+![bidirect](./img/img10.jpg) +
+
+ +- 下図は通常の順次 AR、ランダム順序での AR(1 トークンマスク)、今回の複数トークン同時予測 AR(複数トークンマスク)の比較。ここで Transformer への入力が masked image である理由が明かされる。 + +
+
+![AR_and_MAR](./img/img11.jpg) +
+
+ +## 実装 + +- Diffusion Loss:cosine 型ノイズスケジュール、訓練時は DDPM を 1000 ステップ、推論時は 100 ステップで実行 +- Denoising MLP(小型 MLP):3 層 1024 チャネルブロック、各ブロックに LayerNorm、線形層、SiLU 活性化関数、Residual 接続を含む。実装上は AdaLN で Transformer の出力 z を LayerNorm 層に組み込む +- Tokenizer:LDM 公開の Tokenizer を利用、VQ-16 と KL-16 を含む。VQ-16 は VQ-GAN ベースで GAN loss と知覚損失を用い、KL-16 は KL ダイバージェンス正則化で VQ に依存しない +- Transformer:Tokenizer 処理済みのトークン列を受け取り、位置エンコードと [CLS] トークンを付加、32 層 1024 チャネルの Transformer ブロックを通す ViT を使用 +- Masked autoregressive models:訓練時は [0.7, 1.0] のマスキング率で、0.7 はランダムに 70%のトークンをマスク。短すぎるシーケンスを避けるため常に 64 個の [cls] トークンでパディング。推論時はマスキング率を 1.0 から 0 へ段階的に下げ、cosine スケジュールでステップ数調整。デフォルトは 64 ステップ。 +- ベースラインの Autoregressive Model:causal attention を用いた GPT モデルで、入力に [cls] を付け、kv cache と温度パラメータを利用。 + +## 実験 + +モデル実験設定はすべて AR/MAR-L(約 4 億パラメータ)、400 エポック、ImageNet 256×256 + +### Diffusion Loss と Cross-entropy Loss の比較 + +すべての AR/MAR バリエーションにおいて、Diffusion Loss を使った方が性能が良い。中でも AR の改善は最小で、下に行くほど改善が大きく、MAR+bidirect+複数予測モデルの改善が最大。Diffusion Loss が MAR モデルにとって重要であることが示されている。 + +- Diffusion Loss は他の拡散モデルでも使われる CFG(Classifier-Free Guidance)を用いて生成性能を向上させている +- Fréchet Inception Distance (FID) は数値が低いほど良い、Inception Score (IS) は高いほど良い。両者は生成画像の品質評価指標 + +
+
+![Diffusion Loss](./img/img12.jpg) +
+
+ +### トークナイザー + +異なるトークナイザーを使い Diff Loss と組み合わせた効果を検証。離散空間から連続空間へ脱却したため、トークナイザーは制限されず、Diff Loss は連続・離散どちらのトークナイザーにも適用可能。 + +- VQ-16:VQ-VAE の VQ 前の連続潜在表現をトークン化したもの。前述の通り、VQ-16 と KL-16 は LDM 提供のもので、ImageNet で学習している(OpenImages ではない) +- Consistency Decoder:非 VQ のトークナイザーで、[DALL·E 3](https://github.com/openai/consistencydecoder)由来 +- reconstruction FID (rFID):低いほど良く、トークナイザー評価指標 + +
+
+![Tokenizers](./img/img13.jpg) +
+
+ +結果として、連続型トークナイザーの KL-16 は離散型 VQ-16 より良い性能を示し、Consistency Decoder のような異なるトークナイザーにも対応可能なことを示している。 + +### Denoising MLP + +パラメータ数の異なる MLP の性能比較。幅(width)1024 が最良の性能を示す。 + +
+
![Denoising MLP](./img/img14.jpg)
+
+ +### Diffusion Loss のサンプリングステップ数 + +Diffusion のステップ数による生成品質の影響。ステップ数 100 で十分良い性能を得られる。 + +
+
+![Sampling steps of Diffusion Loss](./img/img15.jpg) +
+
+ +### Diffusion Loss の温度パラメータ + +Diff Loss における温度設定も重要。 + +
+
+![Temperature](./img/img16.jpg) +
+
+ +### スピードと精度のトレードオフ + +kv cache を使用できないため、この評価は重要。A100 を用い、バッチサイズ 256 で計測。 + +
+
+![Speed](./img/img17.jpg) +
+
+ +- MAR: 各点は異なる自己回帰ステップ数(8 ~ 128) +- DiT: 各点は異なる拡散ステップ数(50, 75, 150, 250)、DiT-XL を使用 +- AR: kv cache を使用 + +kv cache なしでも推論速度は悪くないが、デフォルト設定(ステップ 64)はやや遅い。 + +### システムレベルの比較 + +他モデルとの比較。最大モデル MAR-H が最良の性能を示すが、MAR-L も十分良好。 + +
+
+![System-level comparison](./img/img18.jpg) +
+
+ +## 結論 + +本研究は自己回帰画像生成の従来手法を打破し、Diffusion と MAE を融合した新たな方向性を切り拓いた。単純な DDPM を使った実証に留まるが、より優れた拡散モデルの適用でさらなる性能向上が期待される。今後の派生研究に期待したい。 + +最後に一部の生成画像を掲載。 + +
+
+![result](./img/img19.jpg) +
+
+ +## 参考文献 + +[https://zhouyifan.net/2024/07/27/20240717-ar-wo-vq/](https://zhouyifan.net/2024/07/27/20240717-ar-wo-vq/) diff --git a/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/_category_.json b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/_category_.json new file mode 100644 index 00000000000..85bc3954950 --- /dev/null +++ b/i18n/ja/docusaurus-plugin-content-docs-papers/current/image-generation/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Image Generation", + "link": { + "type": "generated-index" + } +} diff --git a/papers/image-generation/2406-mar/img/img1.jpg b/papers/image-generation/2406-mar/img/img1.jpg new file mode 100644 index 00000000000..b3275300e2c Binary files /dev/null and b/papers/image-generation/2406-mar/img/img1.jpg differ diff --git a/papers/image-generation/2406-mar/img/img10.jpg b/papers/image-generation/2406-mar/img/img10.jpg new file mode 100644 index 00000000000..578f316d23c Binary files /dev/null and b/papers/image-generation/2406-mar/img/img10.jpg differ diff --git a/papers/image-generation/2406-mar/img/img11.jpg b/papers/image-generation/2406-mar/img/img11.jpg new file mode 100644 index 00000000000..e59de2b97eb Binary files /dev/null and b/papers/image-generation/2406-mar/img/img11.jpg differ diff --git a/papers/image-generation/2406-mar/img/img12.jpg b/papers/image-generation/2406-mar/img/img12.jpg new file mode 100644 index 00000000000..fc19bc5bc15 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img12.jpg differ diff --git a/papers/image-generation/2406-mar/img/img13.jpg b/papers/image-generation/2406-mar/img/img13.jpg new file mode 100644 index 00000000000..cefad30cdfc Binary files /dev/null and b/papers/image-generation/2406-mar/img/img13.jpg differ diff --git a/papers/image-generation/2406-mar/img/img14.jpg b/papers/image-generation/2406-mar/img/img14.jpg new file mode 100644 index 00000000000..ee745038b69 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img14.jpg differ diff --git a/papers/image-generation/2406-mar/img/img15.jpg b/papers/image-generation/2406-mar/img/img15.jpg new file mode 100644 index 00000000000..e095988085a Binary files /dev/null and b/papers/image-generation/2406-mar/img/img15.jpg differ diff --git a/papers/image-generation/2406-mar/img/img16.jpg b/papers/image-generation/2406-mar/img/img16.jpg new file mode 100644 index 00000000000..4b2d10a1e5e Binary files /dev/null and b/papers/image-generation/2406-mar/img/img16.jpg differ diff --git a/papers/image-generation/2406-mar/img/img17.jpg b/papers/image-generation/2406-mar/img/img17.jpg new file mode 100644 index 00000000000..b0b29804523 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img17.jpg differ diff --git a/papers/image-generation/2406-mar/img/img18.jpg b/papers/image-generation/2406-mar/img/img18.jpg new file mode 100644 index 00000000000..1d5fa70f1b1 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img18.jpg differ diff --git a/papers/image-generation/2406-mar/img/img19.jpg b/papers/image-generation/2406-mar/img/img19.jpg new file mode 100644 index 00000000000..504121111b7 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img19.jpg differ diff --git a/papers/image-generation/2406-mar/img/img2.jpg b/papers/image-generation/2406-mar/img/img2.jpg new file mode 100644 index 00000000000..7d295a6baa0 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img2.jpg differ diff --git a/papers/image-generation/2406-mar/img/img3.jpg 
b/papers/image-generation/2406-mar/img/img3.jpg new file mode 100644 index 00000000000..13d02eccc31 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img3.jpg differ diff --git a/papers/image-generation/2406-mar/img/img4.jpg b/papers/image-generation/2406-mar/img/img4.jpg new file mode 100644 index 00000000000..461284a0a04 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img4.jpg differ diff --git a/papers/image-generation/2406-mar/img/img5.jpg b/papers/image-generation/2406-mar/img/img5.jpg new file mode 100644 index 00000000000..412bf3801e9 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img5.jpg differ diff --git a/papers/image-generation/2406-mar/img/img6.jpg b/papers/image-generation/2406-mar/img/img6.jpg new file mode 100644 index 00000000000..1faf46bddfb Binary files /dev/null and b/papers/image-generation/2406-mar/img/img6.jpg differ diff --git a/papers/image-generation/2406-mar/img/img7.jpg b/papers/image-generation/2406-mar/img/img7.jpg new file mode 100644 index 00000000000..c60b5f2bc54 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img7.jpg differ diff --git a/papers/image-generation/2406-mar/img/img8.jpg b/papers/image-generation/2406-mar/img/img8.jpg new file mode 100644 index 00000000000..693e2695661 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img8.jpg differ diff --git a/papers/image-generation/2406-mar/img/img9.jpg b/papers/image-generation/2406-mar/img/img9.jpg new file mode 100644 index 00000000000..3821a9fef43 Binary files /dev/null and b/papers/image-generation/2406-mar/img/img9.jpg differ diff --git a/papers/image-generation/2406-mar/index.md b/papers/image-generation/2406-mar/index.md new file mode 100644 index 00000000000..76c073021af --- /dev/null +++ b/papers/image-generation/2406-mar/index.md @@ -0,0 +1,244 @@ +--- +title: "[24.06] MAR" +authors: nbswords +--- + +## 重鑄生成的秩序 + +[**Autoregressive Image Generation without Vector Quantization**](https://arxiv.org/abs/2406.11838) + +--- + +:::info +本篇同步發表於 [**nbswords 的 Medium**](https://medium.com/@nbswords/autoregressive-image-generation-without-vector-quantization-516b68b5acfa) +::: + +當前的自迴歸圖片生成模型常使用 vector quantized(VQ)將圖片離散化成一個個 token 來模擬自迴歸模型在 NLP 領域上的成功,但作者認為這種離散值空間對於自迴歸圖片生成模型並不是必要的,因此提出一種基於連續空間的自迴歸圖片生成模型,兼具更高的準度以及快速的推論時間。 + +## 背景知識 + +### Vector Quantization (VQ) + +這是一種行之有年的向量搜尋加速技巧,作法是將一個特徵空間向量(embedding vector)切割成不同的群組,每個群組以一個代表向量來當做 index,再透過一份包含所有代表向量索引的 codebook 來存取這群向量。 + +
+
+![VQ](./img/img1.jpg) +
+
+ +詳細作法可參考 [Survey Of Vector Space Search](https://medium.com/@nbswords/survey-of-vector-space-search-26555890ca5e) 或是 [Vector quantization wiki](https://en.wikipedia.org/wiki/Vector_quantization) + +### Auto-regressive image generation + +早期的 Visual Autoregressive Model (VAR)是直接將圖片生成任務模擬成 GPT-like 的自迴歸文字生成,透過將圖片每一個像素都視為一個種類,模型的任務使用 categorical cross entropy 做多類別預測,如 Google 的[Image Transformer, 2018](https://arxiv.org/abs/1802.05751)和 OpenAI 的[ImageGPT, 2020](https://cdn.openai.com/papers/Generative_Pretraining_from_Pixels_V2.pdf)都屬於這種方法 + +而為了加速圖片生成的速度,現在的自迴歸圖片生成模型的常見作法是引入 VQ 做兩階段訓練,第一階段先在 latent space 中學習用來做圖片重建的 codebook,第二階段則基於這個 learned codebook 去自迴歸地生成圖片 + +- 以[VQ-VAE, 2017](https://arxiv.org/abs/1711.00937)為例 + - Encode 階段先透過是一個 CNN 擷取出圖片特徵,然後對 feature map $z_e$ 做 VQ,得到每個群組的代表向量(紫色的向量, $e_1$ ~ $e_K$),接下來計算 $z_e$ 中每一個特徵點跟代表向量之間的距離,取距離最近的代表向量 index 來取代掉他們得到 $q(z|x)$,這個 $q$ 正是離散的特徵。 + - Decode 階段則使用從 $q$ 映射回來的 $Z_q$ 做圖片生成。 + +
+
+![VQ-VAE](./img/img2.jpg) +
+
+ +自迴歸模型透過 VQ 來加速圖片生成聽起來很棒,但是,這樣的方法難道沒有什麼缺點嗎? + +當然有,1. VQ 編碼器很難訓練 2. VQ 會降低重建出來的圖片品質 + +## Method + +### 放棄 VQ,擁抱 Diffusion + +既然 diffusion model 可以用來表現一張圖片中所有 pixels 或 tokens 的聯合機率分布,那為何不能用來表現每一個 token 的機率分布呢? + +- Diffusion 可以將輸入的 prompt/image 作為條件從 noise 中產圖 +
+
+![diffusion+text_prompt](./img/img3.jpg) +
+
+- 而現在是改為將一個 transformer 的輸出作為條件從 noise 中產圖 (這個 transformer 的輸入是什麼等等會講到) +
+
+![diffusion+transformer](./img/img4.jpg) +
+
+ +他們的做法是自迴歸地去預測出每一個 token 的條件 z,然後利用一個 diffusion model(small MLP)做 denosing 來得到 output x 的機率分布 p(x|z) + +
+
+![autoregressive_diffusion](./img/img5.jpg) +
+
+ +提出 Diffusion Loss 來取代原本的 categorical cross entropy + +
+
+![diffusion_cal](./img/img6.jpg) +
+
+
+
+![diffusion_cal_2](./img/img7.jpg) +
+
- $\varepsilon$ 是一個從標準常態分布 $\mathcal{N}(0, \mathbf{I})$ 中抽樣出來的 noise sample
- $\varepsilon_\theta$ 是一個 small MLP,$\varepsilon_\theta(x_t | t, z)$ 的意思是這個模型會以 $t$ 跟 $z$ 作為生成條件,然後輸入 $x_t$ 來預測出 noise vector $\varepsilon$

  - $t$ 是 noise schedule 的 timestep,$z$ 是由 Transformer 基於上下文產生出來的 condition vector

最後,跟 DDPM 一樣,在推論階段,模型會透過 reverse diffusion process 來產生出圖片(從 $x_t$ 到 $x_0$)
+
+![reverse_diff](./img/img8.jpg) +
+
除此之外,若讀者們有使用過 diffusion model 來做圖片生成,肯定看過一個叫做溫度(temperature)的超參數:溫度越高隨機性越高,溫度越低隨機性越低,是控制採樣隨機性的參數。本作以溫度 $\tau$ 縮放每一步注入的噪聲項 $\sigma_t \delta$(即 $\tau \cdot \sigma_t \delta$)來控制,這個方法出自 [Diffusion Models Beat GANs on Image Synthesis](https://arxiv.org/abs/2105.05233)。

仔細想就會發現,這是一個將 VAE 的條件生成以及 DDPM 的降噪結合在一起使用的方法,用 VAE 壓縮圖像特徵並用 DDPM 來生成圖片,非常精妙。
+
+![VAE_and_Diffusion](./img/img9.jpg) +
+
### 融入 Autoregressive 和 Masked Generative Models 的概念

愷明作為 [MAE(Masked Autoencoders Are Scalable Vision Learners), 2021](https://arxiv.org/abs/2111.06377) 的作者,接下來想到的當然是將 MAE 的概念一起融入到模型當中。

- 主要的流程參考 MAE 的後續作品 [MaskGIT, 2022](https://arxiv.org/abs/2202.04200) 和 [MAGE, 2022](https://arxiv.org/abs/2211.09117),兩者都是拿 Transformer 做 Masked Autoregressive(MAR),其中 MaskGIT 的貢獻是使用 bidirectional transformer decoder 去同時預測多個 token,而 MAGE 的貢獻是統一圖片表徵學習與圖片合成。
- 在本作中同樣採用了 MAE 的雙向注意力機制,將 mask token [m] 放在中間層讓所有 token 能看見彼此,而不是只能看到前面的 token,並且只對被 mask 到的 token 計算 loss。

  - 請注意,這並不是一般認為的 causal vs bidirectional attention,若想要深入理解請去閱讀 MAE 原始論文。
  - 這樣做的好處是圖片生成的表現會變好,壞處是這會讓 transformer 的訓練和推論無法使用 kv cache 加速,但也由於可以同時預測多個 token,這個方法並不會太慢。
+
+![bidirect](./img/img10.jpg) +
+
+ +- 下圖是一般順序的 AR、隨機順序的 AR(random masking one token)和他們使用的同時預測多個 token 的 AR(random masking multiple token)的比較 - 這裡就揭曉了前面那個 transformer 的輸入是一個 masked image +
+
+![AR_and_MAR](./img/img11.jpg) +
+
+ +## Implementation + +- Diffusion Loss:cosine 形狀的 noise schedule,訓練時 DDPM 有 1000 step 而推論則僅有 100 step +- Denoising MLP(small MLP):3 層 1024 個 channel 的 block,每一個 block 包含 LayerNorm, linear layer, SiLU 激活函數並使用 residual connection 連接,實作上是使用 AdaLN 將 transformer 的輸出 z 加入到 LayerNorm 層當中 +- Tokenizer:使用 LDM 提供的公開 tokenizer,包括 VQ-16 和 KL-16。其中 VQ-16 是基於 VQ-GAN 的量化模型,使用 GAN loss 和感知 loss,KL-16 則透過 KL 散度做 regularization 且不依賴 VQ +- Transformer:使用 ViT 來接收 tokenizer 處理後的 token sequence,加上位置編碼和類別 token [CLS],然後通過 32 層 1024 個 channel 的 transformer block +- Masked autoregressive models:在訓練階段使用 [0.7, 1.0] 的 masking ratio,0.7 代表隨機遮蔽掉 70%的 token,另外為了避免抽樣出來的序列太短,他們始終 pad 64 個[cls] token 到其中。在推理階段會逐步將 1.0 的 masking ratio 降低到 0,並使用 cosine schedule 來調整步數,預設是 64 步 +- Baseline Autoregressive Model: causal attention 的 GPT 模型,輸入有 append 一個[cls],並且有使用 kv cache 以及溫度參數 + +## Experiments + +模型實驗設定都是 AR/MAR-L (∼400M parameters), 400 epochs, ImageNet 256×256 + +### Diffusion Loss vs. Cross-entropy Loss + +在所有 AR/MAR 的變體之下,使用 Diff Loss 的表現都比較好,其中 AR 的改進最少,越往下改進越多,MAR+bidirect+>1 preds 的改進最多,可見 Diff Loss 對於 MAR 模型的重要性 + +- Diff Loss 也有使用其他擴散模型也有的 CFG(Classifier-Free Guidance)來提升生成效果 +- Fréchet inception distance (FID) 是越低越好,Inception Score (IS)則是越高越好,兩個 metrics 都是在衡量生成的圖片品質 + +
+
+![Diffusion Loss](./img/img12.jpg) +
+
+ +### Tokenizers + +實驗不同的 tokenizers 搭配 Diff loss 的效果,由於從離散空間脫離到了連續空間,tokenizer 不再受限,Diff loss 可以適用在連續/離散的 tokenizer 上 + +- VQ-16 指的是將 VQ-VAE 中作 VQ 之前的連續值 latent 作為 token,前面也有提過這裡的 VQ-16 跟 KL-16 都是使用 LDM 提供的,只是他們改為在 ImageNet 上訓練而非 OpenImages 上 +- Consistency Decoder 是一個非 VQ 的 Tokenizer,來自[DALL·E 3](https://github.com/openai/consistencydecoder) +- reconstruction FID (rFID) 越低越好,用來評估 tokenizer + +
+
+![Tokenizers](./img/img13.jpg) +
+
結果顯示使用連續型的 tokenizer 如 KL-16 會比離散型的 VQ-16 表現好,另外也順便展示了能兼容不同的 tokenizer 如 Consistency Decoder。

### Denoising MLP

不同參數大小的 MLP 表現差異,width=1024 的表現最好。
+
![Denoising MLP](./img/img14.jpg)
+
+ +### Sampling steps of Diffusion Loss + +不同 Diffusion steps 對於生成品質的影響,steps=100 就可以獲得很好的效果 + +
+
+![Sampling steps of Diffusion Loss](./img/img15.jpg) +
+
+ +### Temperature of Diffusion Loss + +對 Diff Loss 來說溫度也很重要 + +
+
+![Temperature](./img/img16.jpg) +
+
+ +### Speed/accuracy trade-off + +由於無法使用 kv cache,這個評估非常重要,這裡是用一個 A100 然後 batch size=256 來測試 + +
+
+![Speed](./img/img17.jpg) +
+
+ +- MAR: 每一個點代表不同的 autoregressive step(8 to 128) +- DiT: 每一個點代表不同的 diffusion steps(50, 75, 150, 250),這裡使用的 DiT-XL +- AR: 套用了 kv cache + +可見即使無法使用 kv cache 這個模型也有不差的推論速度,不過 default 設定(step=64)還是慢了不少 + +### System-level comparison + +與其他模型的比較,最大的模型 MAR-H 才能獲得最好的表現,但 MAR-L 表現也已經不錯 + +
+
+![System-level comparison](./img/img18.jpg) +
+
+ +## 結論 + +總的來說,本作打破了自迴歸圖片生成模型的舊有做法,以結合 Diffusion 和 MAE 開闢出新的方向,而且只使用了最簡單的 DDPM 來作展示,若改用更好的擴散模型肯定能獲得更好的結果,期待後續有更多衍生作品 + +最後放上一部份模型產生出來的圖 + +
+
+![result](./img/img19.jpg) +
+
+ +## Reference + +https://zhouyifan.net/2024/07/27/20240717-ar-wo-vq/ diff --git a/papers/image-generation/_category_.json b/papers/image-generation/_category_.json new file mode 100644 index 00000000000..85bc3954950 --- /dev/null +++ b/papers/image-generation/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Image Generation", + "link": { + "type": "generated-index" + } +}