diff --git a/Demo/styleTTS_inference_ljspeech_colab.ipynb b/Demo/styleTTS_inference_ljspeech_colab.ipynb new file mode 100644 index 0000000..7cc7d53 --- /dev/null +++ b/Demo/styleTTS_inference_ljspeech_colab.ipynb @@ -0,0 +1,1046 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "gpuClass": "standard" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "eBBMOwCXTIlQ" + } + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "erRp8X9VACPU", + "outputId": "e8141a7f-3d05-4f0e-d93f-f645ed526e0c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'StyleTTS'...\n", + "remote: Enumerating objects: 184, done.\u001b[K\n", + "remote: Counting objects: 100% (50/50), done.\u001b[K\n", + "remote: Compressing objects: 100% (5/5), done.\u001b[K\n", + "remote: Total 184 (delta 46), reused 45 (delta 45), pack-reused 134\u001b[K\n", + "Receiving objects: 100% (184/184), 113.75 MiB | 17.83 MiB/s, done.\n", + "Resolving deltas: 100% (79/79), done.\n", + "Updating files: 100% (29/29), done.\n", + "/content/StyleTTS\n" + ] + } + ], + "source": [ + "!git clone https://github.com/yl4579/StyleTTS.git\n", + "%cd StyleTTS" + ] + }, + { + "cell_type": "code", + "source": [ + "model_path = \"https://drive.google.com/file/d/1aqOExU7NroGHdIVjgkzqRYrK5q_694cj/view?usp=sharing\"\n", + "vocoder_path = \"https://drive.google.com/file/d/1h_h0GFdC6VOiZ-oFDClqy2bVonA1xDiw/view?usp=sharing\"\n", + "ljspeech_sample_path = \"https://drive.google.com/file/d/1aL8uAi-h6WlqDAbjauq5Cfs70gKmsHmb/view?usp=sharing\"" + ], + "metadata": { + "id": "bDUL56ZT6vkZ" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import gdown\n", + "gdown.download(model_path, \"Models.zip\", quiet=False, fuzzy=True)\n", + "gdown.download(vocoder_path, \"Vocoder.zip\", quiet=False, fuzzy=True)\n", + "gdown.download(ljspeech_sample_path, \"LJSpeech_sample.zip\", quiet=False, fuzzy=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 239 + }, + "id": "zfcl3xMFELkl", + "outputId": "9a467866-c664-43cf-e3c9-6ffe4602473b" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=1aqOExU7NroGHdIVjgkzqRYrK5q_694cj\n", + "To: /content/StyleTTS/Models.zip\n", + "100%|██████████| 303M/303M [00:07<00:00, 40.7MB/s]\n", + "Downloading...\n", + "From: https://drive.google.com/uc?id=1h_h0GFdC6VOiZ-oFDClqy2bVonA1xDiw\n", + "To: /content/StyleTTS/Vocoder.zip\n", + "100%|██████████| 53.1M/53.1M [00:01<00:00, 42.6MB/s]\n", + "Downloading...\n", + "From: https://drive.google.com/uc?id=1aL8uAi-h6WlqDAbjauq5Cfs70gKmsHmb\n", + "To: /content/StyleTTS/LJSpeech_sample.zip\n", + "100%|██████████| 2.02M/2.02M [00:00<00:00, 130MB/s]\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'LJSpeech_sample.zip'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "!unzip -o /content/StyleTTS/Models.zip -d /content/StyleTTS/\n", + "!unzip -o /content/StyleTTS/Vocoder.zip -d 
/content/StyleTTS/\n", + "!unzip -o /content/StyleTTS/LJSpeech_sample.zip -d /content/StyleTTS/Data/" + ], + "metadata": { + "id": "wLj8822pEENp", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "906342ec-9b4e-4a9a-97d4-1e5bdc5d1385" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Archive: /content/StyleTTS/Models.zip\n", + " creating: /content/StyleTTS/Models/\n", + " creating: /content/StyleTTS/Models/LJSpeech/\n", + " inflating: /content/StyleTTS/Models/LJSpeech/config.yml \n", + " inflating: /content/StyleTTS/Models/LJSpeech/epoch_2nd_00180.pth \n", + "Archive: /content/StyleTTS/Vocoder.zip\n", + " creating: /content/StyleTTS/Vocoder/\n", + " inflating: /content/StyleTTS/Vocoder/config.json \n", + " inflating: /content/StyleTTS/Vocoder/g_00750000 \n", + "Archive: /content/StyleTTS/LJSpeech_sample.zip\n", + " creating: /content/StyleTTS/Data/LJSpeech-1.1/wavs/\n", + " inflating: /content/StyleTTS/Data/LJSpeech-1.1/wavs/LJ001-0001.wav \n", + " inflating: /content/StyleTTS/Data/LJSpeech-1.1/wavs/LJ001-0002.wav \n", + " inflating: /content/StyleTTS/Data/LJSpeech-1.1/wavs/LJ001-0003.wav \n", + " inflating: /content/StyleTTS/Data/LJSpeech-1.1/wavs/LJ001-0004.wav \n", + " inflating: /content/StyleTTS/Data/LJSpeech-1.1/wavs/LJ001-0005.wav \n", + " inflating: /content/StyleTTS/Data/train_list.txt \n", + " inflating: /content/StyleTTS/Data/val_list.txt \n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install SoundFile torchaudio munch torch pydub pyyaml librosa phonemizer attrdict git+https://github.com/resemble-ai/monotonic_align.git" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BkfOde_6ATJ8", + "outputId": "28d5b16b-dd74-4965-c382-8dda0510e8f4" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting git+https://github.com/resemble-ai/monotonic_align.git\n", + " Cloning https://github.com/resemble-ai/monotonic_align.git to /tmp/pip-req-build-wpy8n2fx\n", + " Running command git clone --filter=blob:none --quiet https://github.com/resemble-ai/monotonic_align.git /tmp/pip-req-build-wpy8n2fx\n", + " Resolved https://github.com/resemble-ai/monotonic_align.git to commit 78b985be210a03d08bc3acc01c4df0442105366f\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: SoundFile in /usr/local/lib/python3.9/dist-packages (0.12.1)\n", + "Requirement already satisfied: torchaudio in /usr/local/lib/python3.9/dist-packages (2.0.1+cu118)\n", + "Collecting munch\n", + " Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.9/dist-packages (2.0.0+cu118)\n", + "Collecting pydub\n", + " Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.9/dist-packages (6.0)\n", + "Requirement already satisfied: librosa in /usr/local/lib/python3.9/dist-packages (0.10.0.post2)\n", + "Collecting phonemizer\n", + " Downloading phonemizer-3.2.1-py3-none-any.whl (90 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m90.6/90.6 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting attrdict\n", + " Downloading attrdict-2.0.1-py2.py3-none-any.whl (9.9 kB)\n", + "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.9/dist-packages (from SoundFile) (1.15.1)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.9/dist-packages (from torch) (2.0.0)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.9/dist-packages (from torch) (3.1.2)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from torch) (4.5.0)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.9/dist-packages (from torch) (3.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from torch) (3.11.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.9/dist-packages (from torch) (1.11.1)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.9/dist-packages (from triton==2.0.0->torch) (16.0.1)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.9/dist-packages (from triton==2.0.0->torch) (3.25.2)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.9/dist-packages (from munch) (1.16.0)\n", + "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.9/dist-packages (from librosa) (0.3.5)\n", + "Requirement already satisfied: pooch<1.7,>=1.0 in /usr/local/lib/python3.9/dist-packages (from librosa) (1.6.0)\n", + "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.9/dist-packages (from librosa) (3.0.0)\n", + "Requirement already satisfied: numpy!=1.22.0,!=1.22.1,!=1.22.2,>=1.20.3 in /usr/local/lib/python3.9/dist-packages (from librosa) (1.22.4)\n", + "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.9/dist-packages (from librosa) (1.0.5)\n", + "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.9/dist-packages (from librosa) (0.56.4)\n", + "Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.9/dist-packages (from librosa) (1.2.2)\n", + "Requirement already satisfied: scipy>=1.2.0 in /usr/local/lib/python3.9/dist-packages (from librosa) (1.10.1)\n", + "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.9/dist-packages (from librosa) (1.2.0)\n", + "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.9/dist-packages (from librosa) (4.4.2)\n", + "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.9/dist-packages (from librosa) (0.2)\n", + "Requirement already 
satisfied: attrs>=18.1 in /usr/local/lib/python3.9/dist-packages (from phonemizer) (23.1.0)\n", + "Collecting dlinfo\n", + " Downloading dlinfo-1.2.1-py3-none-any.whl (3.6 kB)\n", + "Collecting segments\n", + " Downloading segments-2.2.1-py2.py3-none-any.whl (15 kB)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.9/dist-packages (from cffi>=1.0->SoundFile) (2.21)\n", + "Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.9/dist-packages (from numba>=0.51.0->librosa) (0.39.1)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.9/dist-packages (from numba>=0.51.0->librosa) (67.6.1)\n", + "Requirement already satisfied: appdirs>=1.3.0 in /usr/local/lib/python3.9/dist-packages (from pooch<1.7,>=1.0->librosa) (1.4.4)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.9/dist-packages (from pooch<1.7,>=1.0->librosa) (2.27.1)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from pooch<1.7,>=1.0->librosa) (23.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn>=0.20.0->librosa) (3.1.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2->torch) (2.1.2)\n", + "Collecting clldutils>=1.7.3\n", + " Downloading clldutils-3.19.0-py2.py3-none-any.whl (1.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m80.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: regex in /usr/local/lib/python3.9/dist-packages (from segments->phonemizer) (2022.10.31)\n", + "Collecting csvw>=1.5.6\n", + " Downloading csvw-3.1.3-py2.py3-none-any.whl (56 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.7/56.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.9/dist-packages (from sympy->torch) (1.3.0)\n", + "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.9/dist-packages (from clldutils>=1.7.3->segments->phonemizer) (0.8.10)\n", + "Requirement already satisfied: lxml in /usr/local/lib/python3.9/dist-packages (from clldutils>=1.7.3->segments->phonemizer) (4.9.2)\n", + "Requirement already satisfied: markdown in /usr/local/lib/python3.9/dist-packages (from clldutils>=1.7.3->segments->phonemizer) (3.4.3)\n", + "Collecting pylatexenc\n", + " Downloading pylatexenc-2.10.tar.gz (162 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m162.6/162.6 kB\u001b[0m \u001b[31m24.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting colorlog\n", + " Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)\n", + "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.9/dist-packages (from clldutils>=1.7.3->segments->phonemizer) (2.8.2)\n", + "Requirement already satisfied: jsonschema in /usr/local/lib/python3.9/dist-packages (from csvw>=1.5.6->segments->phonemizer) (4.3.3)\n", + "Collecting rfc3986<2\n", + " Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)\n", + "Collecting rdflib\n", + " Downloading rdflib-6.3.2-py3-none-any.whl (528 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m528.1/528.1 kB\u001b[0m \u001b[31m45.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting colorama\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", + "Collecting language-tags\n", + " Downloading language_tags-1.2.0-py3-none-any.whl (213 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.4/213.4 kB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: uritemplate>=3.0.0 in /usr/local/lib/python3.9/dist-packages (from csvw>=1.5.6->segments->phonemizer) (4.1.1)\n", + "Collecting isodate\n", + " Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: babel in /usr/local/lib/python3.9/dist-packages (from csvw>=1.5.6->segments->phonemizer) (2.12.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests>=2.19.0->pooch<1.7,>=1.0->librosa) (2022.12.7)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests>=2.19.0->pooch<1.7,>=1.0->librosa) (3.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests>=2.19.0->pooch<1.7,>=1.0->librosa) (1.26.15)\n", + "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests>=2.19.0->pooch<1.7,>=1.0->librosa) (2.0.12)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.9/dist-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (0.19.3)\n", + "Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.9/dist-packages (from markdown->clldutils>=1.7.3->segments->phonemizer) (6.4.1)\n", + "Requirement already satisfied: pyparsing<4,>=2.1.0 in /usr/local/lib/python3.9/dist-packages (from rdflib->csvw>=1.5.6->segments->phonemizer) (3.0.9)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.9/dist-packages (from importlib-metadata>=4.4->markdown->clldutils>=1.7.3->segments->phonemizer) (3.15.0)\n", + "Building wheels for collected packages: monotonic-align, pylatexenc\n", + " Building wheel for monotonic-align (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for monotonic-align: filename=monotonic_align-1.2-cp39-cp39-linux_x86_64.whl size=1357737 sha256=a6030d91825e22010b2dc8992a921a253890c6e0cfc9481ec81c4d58b8eaa3bd\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-_uj2ew4q/wheels/d2/9d/1a/a00ae88b1bbef548fee5e752305e489f9f07265fef339b55d1\n", + " Building wheel for pylatexenc (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for pylatexenc: filename=pylatexenc-2.10-py3-none-any.whl size=136831 sha256=6aef1da7f86be60bec87c49ba6a7460feca15bf96be17e3309e44b5857dd7a1b\n", + " Stored in directory: /root/.cache/pip/wheels/a3/68/66/2f15abd0673d83c02f354115feedeb89c3daed2ac319b11090\n", + "Successfully built monotonic-align pylatexenc\n", + "Installing collected packages: rfc3986, pylatexenc, pydub, language-tags, dlinfo, munch, monotonic-align, isodate, colorlog, colorama, attrdict, rdflib, csvw, clldutils, segments, phonemizer\n", + "Successfully installed attrdict-2.0.1 clldutils-3.19.0 colorama-0.4.6 colorlog-6.7.0 csvw-3.1.3 dlinfo-1.2.1 isodate-0.6.1 language-tags-1.2.0 monotonic-align-1.2 munch-2.5.0 phonemizer-3.2.1 pydub-0.25.1 pylatexenc-2.10 rdflib-6.3.2 rfc3986-1.5.0 segments-2.2.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "!sudo apt-get install espeak-ng" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6Ko9aBJrCkVN", + "outputId": "fd673db2-5755-4090-f07b-85ac468ef119" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Reading package lists... Done\n", + "Building dependency tree \n", + "Reading state information... Done\n", + "The following additional packages will be installed:\n", + " espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0\n", + "The following NEW packages will be installed:\n", + " espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0\n", + "0 upgraded, 5 newly installed, 0 to remove and 24 not upgraded.\n", + "Need to get 4,215 kB of archives.\n", + "After this operation, 12.0 MB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu focal/main amd64 libpcaudio0 amd64 1.1-4 [7,908 B]\n", + "Get:2 http://archive.ubuntu.com/ubuntu focal/main amd64 libsonic0 amd64 0.2.0-8 [13.1 kB]\n", + "Get:3 http://archive.ubuntu.com/ubuntu focal/main amd64 espeak-ng-data amd64 1.50+dfsg-6 [3,682 kB]\n", + "Get:4 http://archive.ubuntu.com/ubuntu focal/main amd64 libespeak-ng1 amd64 1.50+dfsg-6 [189 kB]\n", + "Get:5 http://archive.ubuntu.com/ubuntu focal/universe amd64 espeak-ng amd64 1.50+dfsg-6 [322 kB]\n", + "Fetched 4,215 kB in 2s (2,210 kB/s)\n", + "debconf: unable to initialize frontend: Dialog\n", + "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 5.)\n", + "debconf: falling back to frontend: Readline\n", + "debconf: unable to initialize frontend: Readline\n", + "debconf: (This frontend requires a controlling tty.)\n", + "debconf: falling back to frontend: Teletype\n", + "dpkg-preconfigure: unable to re-open stdin: \n", + "Selecting previously unselected package libpcaudio0:amd64.\n", + "(Reading database ... 
122352 files and directories currently installed.)\n",
+      "Preparing to unpack .../libpcaudio0_1.1-4_amd64.deb ...\n",
+      "Unpacking libpcaudio0:amd64 (1.1-4) ...\n",
+      "Selecting previously unselected package libsonic0:amd64.\n",
+      "Preparing to unpack .../libsonic0_0.2.0-8_amd64.deb ...\n",
+      "Unpacking libsonic0:amd64 (0.2.0-8) ...\n",
+      "Selecting previously unselected package espeak-ng-data:amd64.\n",
+      "Preparing to unpack .../espeak-ng-data_1.50+dfsg-6_amd64.deb ...\n",
+      "Unpacking espeak-ng-data:amd64 (1.50+dfsg-6) ...\n",
+      "Selecting previously unselected package libespeak-ng1:amd64.\n",
+      "Preparing to unpack .../libespeak-ng1_1.50+dfsg-6_amd64.deb ...\n",
+      "Unpacking libespeak-ng1:amd64 (1.50+dfsg-6) ...\n",
+      "Selecting previously unselected package espeak-ng.\n",
+      "Preparing to unpack .../espeak-ng_1.50+dfsg-6_amd64.deb ...\n",
+      "Unpacking espeak-ng (1.50+dfsg-6) ...\n",
+      "Setting up libpcaudio0:amd64 (1.1-4) ...\n",
+      "Setting up libsonic0:amd64 (0.2.0-8) ...\n",
+      "Setting up espeak-ng-data:amd64 (1.50+dfsg-6) ...\n",
+      "Setting up libespeak-ng1:amd64 (1.50+dfsg-6) ...\n",
+      "Setting up espeak-ng (1.50+dfsg-6) ...\n",
+      "Processing triggers for man-db (2.9.1-1) ...\n",
+      "Processing triggers for libc-bin (2.31-0ubuntu9.9) ...\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# load packages\n",
+    "import random\n",
+    "import yaml\n",
+    "from munch import Munch\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "import torch.nn.functional as F\n",
+    "import torchaudio\n",
+    "import librosa\n",
+    "\n",
+    "from models import *\n",
+    "from utils import *"
+   ],
+   "metadata": {
+    "id": "mTmIwun_AVqH"
+   },
+   "execution_count": 10,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Check that CUDA is available; if it is not, switch the Colab runtime type to GPU"
+   ],
+   "metadata": {
+    "id": "VcTC0nIW6PVp"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
+   ],
+   "metadata": {
+    "id": "csttlZZfAg14"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "device"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 35
+    },
+    "id": "oTrdOK7oUM4N",
+    "outputId": "0affd406-0332-4c3f-e517-4c294b67be72"
+   },
+   "execution_count": null,
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "'cuda'"
+      ],
+      "application/vnd.google.colaboratory.intrinsic+json": {
+       "type": "string"
+      }
+     },
+     "metadata": {},
+     "execution_count": 11
+    }
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "_pad = \"$\"\n",
+    "_punctuation = ';:,.!?¡¿—…\"«»“” '\n",
+    "_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'\n",
+    "_letters_ipa = \"ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ\"\n",
+    "\n",
+    "# Export all symbols:\n",
+    "symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)\n",
+    "\n",
+    "# map every symbol to its integer index\n",
+    "dicts = {}\n",
+    "for i in range(len(symbols)):\n",
+    "    dicts[symbols[i]] = i\n",
+    "\n",
+    "class TextCleaner:\n",
+    "    def __init__(self, dummy=None):\n",
+    "        self.word_index_dictionary = dicts\n",
+    "    def __call__(self, text):\n",
+    "        indexes = []\n",
+    "        for char in text:\n",
+    "            try:\n",
+    "                indexes.append(self.word_index_dictionary[char])\n",
+    "            except KeyError:\n",
+    "                print(char)  # report characters missing from the symbol set\n",
+    "        return indexes\n",
+    "\n",
+    "textcleaner = TextCleaner()"
+   ],
+   "metadata": {
+    "id": "aqEDFLPfAkQ_"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
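+  {
+   "cell_type": "markdown",
+   "source": [
+    "Optional sanity check (illustrative only, not part of the original pipeline): `TextCleaner` simply maps each phoneme, letter, or punctuation mark to its index in `symbols`, so encoding a short IPA string and decoding it again should round-trip."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# encode a phonemized string, then invert the mapping to check it round-trips\n",
+    "example_ids = textcleaner('həlˈoʊ wˈɜːld')\n",
+    "print(example_ids)\n",
+    "print(''.join(symbols[i] for i in example_ids))"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },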
"execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "to_mel = torchaudio.transforms.MelSpectrogram(\n", + " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n", + "mean, std = -4, 4\n", + "\n", + "def length_to_mask(lengths):\n", + " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n", + " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n", + " return mask\n", + "\n", + "def preprocess(wave):\n", + " wave_tensor = torch.from_numpy(wave).float()\n", + " mel_tensor = to_mel(wave_tensor)\n", + " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n", + " return mel_tensor\n", + "\n", + "def compute_style(ref_dicts):\n", + " reference_embeddings = {}\n", + " for key, path in ref_dicts.items():\n", + " wave, sr = librosa.load(path, sr=24000)\n", + " audio, index = librosa.effects.trim(wave, top_db=30)\n", + " if sr != 24000:\n", + " audio = librosa.resample(audio, sr, 24000)\n", + " mel_tensor = preprocess(audio).to(device)\n", + "\n", + " with torch.no_grad():\n", + " ref = model.style_encoder(mel_tensor.unsqueeze(1))\n", + " reference_embeddings[key] = (ref.squeeze(1), audio)\n", + " \n", + " return reference_embeddings" + ], + "metadata": { + "id": "Tgs-cZU8AvDs" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# load phonemizer\n", + "import phonemizer\n", + "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)" + ], + "metadata": { + "id": "jRzzC9NnAzC2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# load hifi-gan\n", + "\n", + "import sys\n", + "sys.path.insert(0, \"./Demo/hifi-gan\")\n", + "\n", + "import glob\n", + "import os\n", + "import argparse\n", + "import json\n", + "import torch\n", + "from scipy.io.wavfile import write\n", + "from attrdict import AttrDict\n", + "from vocoder import Generator\n", + "import librosa\n", + "import numpy as np\n", + "import torchaudio\n", + "\n", + "h = None\n", + "\n", + "def load_checkpoint(filepath, device):\n", + " assert os.path.isfile(filepath)\n", + " print(\"Loading '{}'\".format(filepath))\n", + " checkpoint_dict = torch.load(filepath, map_location=device)\n", + " print(\"Complete.\")\n", + " return checkpoint_dict\n", + "\n", + "def scan_checkpoint(cp_dir, prefix):\n", + " pattern = os.path.join(cp_dir, prefix + '*')\n", + " cp_list = glob.glob(pattern)\n", + " if len(cp_list) == 0:\n", + " return ''\n", + " return sorted(cp_list)[-1]\n", + "\n", + "cp_g = scan_checkpoint(\"Vocoder/\", 'g_')\n", + "\n", + "config_file = os.path.join(os.path.split(cp_g)[0], 'config.json')\n", + "with open(config_file) as f:\n", + " data = f.read()\n", + "json_config = json.loads(data)\n", + "h = AttrDict(json_config)\n", + "\n", + "device = torch.device(device)\n", + "generator = Generator(h).to(device)\n", + "\n", + "state_dict_g = load_checkpoint(cp_g, device)\n", + "generator.load_state_dict(state_dict_g['generator'])\n", + "generator.eval()\n", + "generator.remove_weight_norm()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MCrw-PUqCNFy", + "outputId": "18d1dc84-3dd1-46c8-91f6-5a6ccd5d0839" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading 'Vocoder/g_00750000'\n", + "Complete.\n", + "Removing weight norm...\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# load 
+    "model_path = \"./Models/LJSpeech/epoch_2nd_00180.pth\"\n",
+    "model_config_path = \"./Models/LJSpeech/config.yml\"\n",
+    "\n",
+    "config = yaml.safe_load(open(model_config_path))\n",
+    "\n",
+    "# load pretrained ASR model\n",
+    "ASR_config = config.get('ASR_config', False)\n",
+    "ASR_path = config.get('ASR_path', False)\n",
+    "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
+    "\n",
+    "# load pretrained F0 model\n",
+    "F0_path = config.get('F0_path', False)\n",
+    "pitch_extractor = load_F0_models(F0_path)\n",
+    "\n",
+    "model = build_model(Munch(config['model_params']), text_aligner, pitch_extractor)\n",
+    "\n",
+    "params = torch.load(model_path, map_location='cpu')\n",
+    "params = params['net']\n",
+    "for key in model:\n",
+    "    if key in params:\n",
+    "        if not \"discriminator\" in key:\n",
+    "            print('%s loaded' % key)\n",
+    "            model[key].load_state_dict(params[key])\n",
+    "_ = [model[key].eval() for key in model]\n",
+    "_ = [model[key].to(device) for key in model]"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "V_fAsBFDC3g6",
+    "outputId": "0bf43a95-e931-4639-b23b-532e84e3101b"
+   },
+   "execution_count": null,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stderr",
+     "text": [
+      "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/rnn.py:71: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
+      "  warnings.warn(\"dropout option adds dropout after all but last \"\n"
+     ]
+    },
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "predictor loaded\n",
+      "decoder loaded\n",
+      "pitch_extractor loaded\n",
+      "text_encoder loaded\n",
+      "style_encoder loaded\n",
+      "text_aligner loaded\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "def synthesize(text, reference_embeddings):\n",
+    "\n",
+    "    ps = global_phonemizer.phonemize([text])\n",
+    "    tokens = textcleaner(ps[0])\n",
+    "    tokens.insert(0, 0)\n",
+    "    tokens.append(0)\n",
+    "    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
+    "    converted_samples = {}\n",
+    "\n",
+    "    with torch.no_grad():\n",
+    "        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
+    "        m = length_to_mask(input_lengths).to(device)\n",
+    "        t_en = model.text_encoder(tokens, input_lengths, m)\n",
+    "\n",
+    "    for key, (ref, _) in reference_embeddings.items():\n",
+    "        s = ref.squeeze(1)\n",
+    "        style = s\n",
+    "\n",
+    "        d = model.predictor.text_encoder(t_en, style, input_lengths, m)\n",
+    "\n",
+    "        x, _ = model.predictor.lstm(d)\n",
+    "        duration = model.predictor.duration_proj(x)\n",
+    "        pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
+    "\n",
+    "        # expand each phoneme to its predicted number of frames\n",
+    "        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
+    "        c_frame = 0\n",
+    "        for i in range(pred_aln_trg.size(0)):\n",
+    "            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
+    "            c_frame += int(pred_dur[i].data)\n",
+    "\n",
+    "        # encode prosody\n",
+    "        en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
+    "        style = s.expand(en.shape[0], en.shape[1], -1)\n",
+    "\n",
+    "        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
+    "\n",
+    "        out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)), \n",
+    "                            F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
+    "\n",
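+    "        # vocode: run HiFi-GAN on the predicted mel-spectrogram to get the waveform\n",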
+    "        c = out.squeeze()\n",
+    "        y_g_hat = generator(c.unsqueeze(0))\n",
+    "        y_out = y_g_hat.squeeze()\n",
+    "\n",
+    "        converted_samples[key] = y_out.cpu().numpy()\n",
+    "    return converted_samples"
+   ],
+   "metadata": {
+    "id": "0WtqpW-RWXHq"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# # Copy custom files to use as style samples.\n",
+    "# !cp -r /content/gdrive/MyDrive/voice/custom_sample /content/StyleTTS/"
+   ],
+   "metadata": {
+    "id": "6ctNbvbbjpH5"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# use the first 5 training samples as references\n",
+    "\n",
+    "train_path = config.get('train_data', None)\n",
+    "val_path = config.get('val_data', None)\n",
+    "train_list, val_list = get_data_path_list(train_path, val_path)\n",
+    "\n",
+    "ref_dicts = {}\n",
+    "for j in range(5):\n",
+    "    filename = train_list[j].split('|')[0]\n",
+    "    name = filename.split('/')[-1].replace('.wav', '')\n",
+    "    ref_dicts[name] = \"./Data/\" + filename\n",
+    "\n",
+    "# # Manually refer to custom data\n",
+    "# ref_dicts = {\n",
+    "#     \"0_0_14160\" : \"/content/StyleTTS/custom_sample/wavs/0_0_14160.wav\",\n",
+    "#     \"0_21000_32040\" : \"/content/StyleTTS/custom_sample/wavs/0_21000_32040.wav\",\n",
+    "#     \"0_32040_38240\" : \"/content/StyleTTS/custom_sample/wavs/0_32040_38240.wav\",\n",
+    "#     \"10_569880_577880\" : \"/content/StyleTTS/custom_sample/wavs/10_569880_577880.wav\",\n",
+    "#     \"4_1329160_1333160\" : \"/content/StyleTTS/custom_sample/wavs/4_1329160_1333160.wav\"\n",
+    "# }\n",
+    "\n",
+    "reference_embeddings = compute_style(ref_dicts)"
+   ],
+   "metadata": {
+    "id": "kSe5tN8AMvex"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "converted_samples = synthesize('''Hi, my name is redacted, so I won't tell you. This is for sure not fake by the way.''', reference_embeddings)\n",
+    "\n",
+    "import IPython.display as ipd\n",
+    "for key, wave in converted_samples.items():\n",
+    "    print('Synthesized: %s' % key)\n",
+    "    display(ipd.Audio(wave, rate=24000))\n",
+    "    try:\n",
+    "        print('Reference: %s' % key)\n",
+    "        display(ipd.Audio(reference_embeddings[key][-1], rate=24000))\n",
+    "    except KeyError:\n",
+    "        continue"
+   ],
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 633
+    },
+    "id": "2GA0sQMwV0_6",
+    "outputId": "5b61479d-deb6-485a-ed63-edc243607116"
+   },
+   "execution_count": null,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Synthesized: LJ001-0001\n",
+      "Reference: LJ001-0001\n",
+      "Synthesized: LJ001-0002\n",
+      "Reference: LJ001-0002\n",
+      "Synthesized: LJ001-0003\n",
+      "Reference: LJ001-0003\n",
+      "Synthesized: LJ001-0004\n",
+      "Reference: LJ001-0004\n",
+      "Synthesized: LJ001-0005\n",
+      "Reference: LJ001-0005\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# TODO: Synthesise function that takes a custom audio path as the style sample.\n",
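+    "\n",
+    "A possible sketch (hypothetical, untested; it just reuses `compute_style` and `synthesize` from the cells above):\n",
+    "\n",
+    "```python\n",
+    "def synthesize_from_audio(text, wav_path, key='custom'):\n",
+    "    # wav_path: any reference recording; compute_style loads it at 24 kHz\n",
+    "    refs = compute_style({key: wav_path})\n",
+    "    return synthesize(text, refs)[key]\n",
+    "```"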
+   ],
+   "metadata": {
+    "id": "VhHQLAwAiyyU"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [],
+   "metadata": {
+    "id": "QmUTdyRxi4Pi"
+   },
+   "execution_count": null,
+   "outputs": []
+  }
+ ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 2a4e971..ccccce4 100644
--- a/README.md
+++ b/README.md
@@ -15,9 +15,10 @@ Audio samples: [https://styletts.github.io/](https://styletts.github.io/)
 git clone https://github.com/yl4579/StyleTTS.git
 cd StyleTTS
 ```
-3. Install python requirements:
+3. Install requirements:
 ```bash
-pip install SoundFile torchaudio munch torch pydub pyyaml librosa git+https://github.com/resemble-ai/monotonic_align.git
+pip install SoundFile torchaudio munch torch pydub pyyaml librosa phonemizer attrdict git+https://github.com/resemble-ai/monotonic_align.git
+sudo apt-get install espeak-ng
 ```
 4. Download and extract the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/), unzip it to the `Data` folder, and upsample the audio to 24 kHz. The vocoder, text aligner and pitch extractor are pre-trained on 24 kHz data, but you can easily change the preprocessing and re-train them using your own preprocessing. I will provide more recipes and pre-trained models later if I have time. If you are willing to help, feel free to work on other preprocessing methods. For LibriTTS, you will need to combine train-clean-360 with train-clean-100 and rename the folder train-clean-460 (see [val_list_libritts.txt](https://github.com/yl4579/StyleTTS/blob/main/Data/val_list_libritts.txt) as an example).
@@ -37,13 +38,15 @@ The data list format needs to be `filename.wav|transcription`, see [val_list_lib

 ## Inference

-Please refer to [inference.ipynb](https://github.com/yl4579/StyleTTS/blob/main/Demo/Inference_LJSpeech.ipynb) for details.
+### Colab

-The pretrained StyleTTS and Hifi-GAN on LJSpeech corpus in 24 kHz can be downloaded at [StyleTTS Link](https://drive.google.com/file/d/1aqOExU7NroGHdIVjgkzqRYrK5q_694cj/view?usp=sharing) and [Hifi-GAN Link](https://drive.google.com/file/d/1h_h0GFdC6VOiZ-oFDClqy2bVonA1xDiw/view?usp=sharing).
+Please refer to this [Colab notebook](https://colab.research.google.com/drive/18xNehSLiClZC3ub3eXiPCOnR4bz3r5Ur?usp=sharing), which runs end-to-end inference for the LJSpeech model; adapting it to the LibriTTS models should be straightforward. A copy of the notebook is in the `Demo` folder.
+
+### Pretrained Models

-The pretrained StyleTTS and Hifi-GAN on LibriTTS corpus can be downloaded at [StyleTTS Link](https://drive.google.com/file/d/1nm0yB6Y5QWF3FYGfJCwQ6zYNlOAYVSet/view?usp=sharing) and [Hifi-GAN Link](https://drive.google.com/file/d/1RDxYknrzncGzusYeVeDo38ErNdczzbik/view?usp=sharing). You also need to download test-clean from LibriTTS if you want to run the zero-shot demo.
+The pretrained StyleTTS and Hifi-GAN on LJSpeech corpus in 24 kHz can be downloaded at [StyleTTS Link](https://drive.google.com/file/d/1aqOExU7NroGHdIVjgkzqRYrK5q_694cj/view?usp=sharing) and [Hifi-GAN Link](https://drive.google.com/file/d/1h_h0GFdC6VOiZ-oFDClqy2bVonA1xDiw/view?usp=sharing).

-Please unzip to `Models` and `Vocoder` respectivey and run each cell in the notebook. You will also need to install [phonemizer](https://github.com/bootphon/phonemizer) to run this inference demo.
+The pretrained StyleTTS and Hifi-GAN on LibriTTS corpus can be downloaded at [StyleTTS Link](https://drive.google.com/file/d/1nm0yB6Y5QWF3FYGfJCwQ6zYNlOAYVSet/view?usp=sharing) and [Hifi-GAN Link](https://drive.google.com/file/d/1RDxYknrzncGzusYeVeDo38ErNdczzbik/view?usp=sharing).
You also need to download test-clean from LibriTTS if you want to run the zero-shot demo. ## Preprocessing
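For step 4 of the pre-requisites, one possible way to upsample the extracted LJSpeech wavs to 24 kHz (a sketch assuming `librosa` and `soundfile` are installed and that the paths are adjusted to your layout):

```python
import glob
import os

import librosa
import soundfile as sf

src = "LJSpeech-1.1/wavs"        # extracted 22.05 kHz recordings
dst = "Data/LJSpeech-1.1/wavs"   # 24 kHz copies used for training and inference
os.makedirs(dst, exist_ok=True)

for path in glob.glob(os.path.join(src, "*.wav")):
    wav, _ = librosa.load(path, sr=24000)  # load and resample in one step
    sf.write(os.path.join(dst, os.path.basename(path)), wav, 24000)
```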