Huanshere · liaozd · Nov 21, 2024 · Nov 27, 2024 · Mar 3, 2025 · Mar 3, 2025
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1 @@
+.git/
diff --git a/Dockerfile b/Dockerfile
@@ -32,9 +32,6 @@ RUN git clone https://github.com/Huanshere/VideoLingo.git .
 # Install PyTorch and torchaudio
 RUN pip install torch==2.0.0 torchaudio==2.0.0 --index-url https://download.pytorch.org/whl/cu118
 
-# Clean up unnecessary files
-RUN rm -rf .git
-
 # Upgrade pip and install basic dependencies
 RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
     pip install --no-cache-dir --upgrade pip requests rich ruamel.yaml

diff --git a/OneKeyInstall&Start.bat b/OneKeyInstall&Start.bat
@@ -2,6 +2,26 @@
 
 cd /D "%~dp0"
 
+set INSTALL_ENV_DIR=%cd%\installer_files\env
+set CONDA_ROOT_PREFIX=%cd%\installer_files\conda
+
+set PYTHONNOUSERSITE=1
+set PYTHONPATH=
+set PYTHONHOME=
+set "CUDA_PATH=%INSTALL_ENV_DIR%"
+set "CUDA_HOME=%CUDA_PATH%"
+
+@rem Check if conda environment exists
+if exist "%INSTALL_ENV_DIR%\python.exe" (
+    echo Conda environment found, starting directly...
+    echo If startup fails, please delete the 'installer_files' folder and reinstall.
+    @rem Activate environment
+    call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end )
+    python -m streamlit run st.py
+    goto end
+)
+
+@rem Original installation path continues...
 set PATH=%PATH%;%SystemRoot%\system32
 
 echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end
@@ -40,28 +60,17 @@ if "%conda_exists%" == "F" (
 @rem create the installer env
 if not exist "%INSTALL_ENV_DIR%" (
   echo Packages to install: python=3.10.0 requests rich ruamel.yaml
-  call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python=3.10.0 requests rich "ruamel.yaml" || ( echo. && echo Conda environment creation failed. && goto end )
+  call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python=3.10.0 requests rich ruamel.yaml || ( echo. && echo Conda environment creation failed. && goto end )
 )
 
 @rem check if conda environment was actually created
 if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )
 
-@rem environment isolation
-set PYTHONNOUSERSITE=1
-set PYTHONPATH=
-set PYTHONHOME=
-@rem ! may cause error if we use cudnn on windows
-set "CUDA_PATH=%INSTALL_ENV_DIR%"
-set "CUDA_HOME=%CUDA_PATH%"
-
-@rem activate installer env
-call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end )
-
-@rem Run pip setup
+:start
 call python pip_setup.py
 
 echo.
-echo Done!
+echo ✅ Done!
 
 :end
 pause
diff --git a/OneKeyStart.bat b/OneKeyStart.bat
diff --git a/README.md b/README.md
@@ -68,9 +68,14 @@ https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
 
 ## Installation
 
-Windows users can double-click `OneKeyInstall&Start.bat` to install (requires Git). The script will download Miniconda and install the complete environment. For NVIDIA GPU users, you need to first install [CUDA 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) and [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe), then add `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` to system environment variables and restart.
+### Windows
+Simply double-click `OneKeyInstall&Start.bat` to get started. The script will:
+- Download and install Miniconda automatically
+- Install all required dependencies for both GPU and CPU
 
-MacOS/Linux users should install from source, requiring `python=3.10.0` environment.
+Prerequisites: Git must be installed on your system.
+
+### macOS/Linux
 
 1. Clone the repository
 
@@ -79,10 +84,10 @@ git clone https://github.com/Huanshere/VideoLingo.git
 cd VideoLingo
 ```
 
-2. Install dependencies
+2. Install dependencies (requires `python=3.10.0`)
 
 ```bash
-conda create -n videolingo python=3.10.0
+conda create -n videolingo python=3.10.0 -y
 conda activate videolingo
 python install.py
 ```
@@ -93,6 +98,7 @@ python install.py
 streamlit run st.py
 ```
 
+### Docker
 Alternatively, you can use Docker (requires CUDA 12.4 and NVIDIA Driver version >550), see [Docker docs](/docs/pages/docs/docker.en-US.md):
 
 ```bash

diff --git a/core/step2_whisperX.py b/core/step2_whisperX.py
@@ -50,13 +50,24 @@ def check_hf_mirror() -> str:
         rprint("[yellow]⚠️ All mirrors failed, using default[/yellow]")
     rprint(f"[cyan]🚀 Selected mirror:[/cyan] {fastest_url} ({best_time:.2f}s)")
     return fastest_url
-
 def transcribe_audio(audio_file: str, start: float, end: float) -> Dict:
+    """
+    使用WhisperX模型对音频文件进行转录。
+
+    参数:
+        audio_file (str): 要转录的音频文件路径。
+        start (float): 音频片段的起始时间（以秒为单位）。
+        end (float): 音频片段的结束时间（以秒为单位）。
+
+    返回:
+        Dict: 包含转录结果的字典，包括文本、时间戳等信息。
+    """
     os.environ['HF_ENDPOINT'] = check_hf_mirror() #? don't know if it's working...
     WHISPER_LANGUAGE = load_key("whisper.language")
     device = "cuda" if torch.cuda.is_available() else "cpu"
     rprint(f"🚀 Starting WhisperX using device: {device} ...")
-
+
+    # 根据GPU内存设置批处理大小和计算类型
     if device == "cuda":
         gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
         batch_size = 16 if gpu_mem > 8 else 2
@@ -67,52 +78,59 @@ def transcribe_audio(audio_file: str, start: float, end: float) -> Dict:
         compute_type = "int8"
         rprint(f"[cyan]📦 Batch size:[/cyan] {batch_size}, [cyan]⚙️ Compute type:[/cyan] {compute_type}")
     rprint(f"[green]▶️ Starting WhisperX for segment {start:.2f}s to {end:.2f}s...[/green]")
-    
+
     try:
+        # 根据语言选择Whisper模型
         if WHISPER_LANGUAGE == 'zh':
             model_name = "Huan69/Belle-whisper-large-v3-zh-punct-fasterwhisper"
             local_model = os.path.join(MODEL_DIR, "Belle-whisper-large-v3-zh-punct-fasterwhisper")
         else:
             model_name = load_key("whisper.model")
             local_model = os.path.join(MODEL_DIR, model_name)
-
+
+        # 加载本地或远程的Whisper模型
         if os.path.exists(local_model):
             rprint(f"[green]📥 Loading local WHISPER model:[/green] {local_model} ...")
             model_name = local_model
         else:
             rprint(f"[green]📥 Using WHISPER model from HuggingFace:[/green] {model_name} ...")
 
+        # 设置VAD和ASR选项
         vad_options = {"vad_onset": 0.500,"vad_offset": 0.363}
         asr_options = {"temperatures": [0],"initial_prompt": "",}
         whisper_language = None if 'auto' in WHISPER_LANGUAGE else WHISPER_LANGUAGE
         rprint("[bold yellow]**You can ignore warning of `Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118...`**[/bold yellow]")
         model = whisperx.load_model(model_name, device, compute_type=compute_type, language=whisper_language, vad_options=vad_options, asr_options=asr_options, download_root=MODEL_DIR)
 
-        # Create temporary file to store audio segment
-        temp_audio = tempfile.NamedTemporaryFile(suffix='.mp3', delete=False)
-        temp_audio_path = temp_audio.name
-        temp_audio.close()
-        # Use ffmpeg to cut audio
-        ffmpeg_cmd = f'ffmpeg -y -i "{audio_file}" -ss {start} -t {end-start} -vn -b:a 64k -ar 16000 -ac 1 -metadata encoding=UTF-8 -f mp3 "{temp_audio_path}"'
+        # 创建临时WAV文件以提高兼容性
+        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio:
+            temp_audio_path = temp_audio.name
+
+        # 使用ffmpeg提取音频片段
+        ffmpeg_cmd = f'ffmpeg -y -i "{audio_file}" -ss {start} -t {end-start} -vn -ar 16000 -ac 1 "{temp_audio_path}"'
         subprocess.run(ffmpeg_cmd, shell=True, check=True, capture_output=True)
-        # Load the cut audio
-        audio_segment, sample_rate = librosa.load(temp_audio_path, sr=16000)
-        # Delete temporary file
-        os.unlink(temp_audio_path)
+
+        try:
+            # 使用librosa加载音频片段
+            audio_segment, sample_rate = librosa.load(temp_audio_path, sr=16000)
+        finally:
+            # 清理临时文件
+            if os.path.exists(temp_audio_path):
+                os.unlink(temp_audio_path)
 
         rprint("[bold green]note: You will see Progress if working correctly[/bold green]")
         result = model.transcribe(audio_segment, batch_size=batch_size, print_progress=True)
 
-        # Free GPU resources
+        # 释放GPU资源
         del model
         torch.cuda.empty_cache()
 
-        # Save language
+        # 保存语言信息并检查是否与指定语言一致
         save_language(result['language'])
         if result['language'] == 'zh' and WHISPER_LANGUAGE != 'zh':
-            raise ValueError("请指定转录语言为 zh 后重试！")
+            raise ValueError("Please specify the transcription language as zh and try again!")
 
-        # Align whisper output
+        # 对齐Whisper输出
         model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
         result = whisperx.align(result["segments"], model_a, metadata, audio_segment, device, return_char_alignments=False)
 

diff --git a/docs/pages/docs/start.en-US.md b/docs/pages/docs/start.en-US.md
@@ -114,13 +114,6 @@ After configuration, select `Reference Audio Mode` in the sidebar (see Yuque doc
 
 VideoLingo supports Windows, macOS and Linux systems, and can run on CPU or GPU.
 
-For GPU acceleration on Windows, install these dependencies:
-
-- [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe)
-- [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe)
-
-> Note: After installing, add `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` to system path and restart computer 🔄
-
 ### Windows One-Click Install
 
 Make sure [Git](https://git-scm.com/downloads) is installed,
@@ -131,48 +124,37 @@ Make sure [Git](https://git-scm.com/downloads) is installed,
 
 ### Source Installation
 
-Before installing VideoLingo, ensure:
-1. **25GB** free disk space
-2. [Anaconda](https://www.anaconda.com/download) installed (for Python environment management)
-3. [Git](https://git-scm.com/downloads) installed (for cloning project code, or download manually)
+Before installing VideoLingo, ensure you have **25GB** of free disk space and that Git and Anaconda are installed.
 
-Basic Python knowledge required. For any issues, ask the AI assistant at [videolingo.io](https://videolingo.io) bottom right~
 
-1. Open Anaconda Prompt and navigate to installation directory, e.g. desktop:
-   ```bash
-   cd desktop
-   ```
-
-2. Clone project and enter directory:
+1. Clone the project:
    ```bash
    git clone https://github.com/Huanshere/VideoLingo.git
    cd VideoLingo
    ```
 
-3. Create and activate virtual environment (**must be 3.10.0**):
+2. Create and activate virtual environment (**must be python=3.10.0**):
    ```bash
    conda create -n videolingo python=3.10.0 -y
    conda activate videolingo
    ```
 
-4. Run installation script:
+3. Run installation script:
    ```bash
    python install.py
    ```
    Script will automatically install appropriate torch version
 
-5. 🎉 Enter command to launch Streamlit app:
+4. 🎉 Launch Streamlit app:
    ```bash
    streamlit run st.py
    ```
 
-6. Set key in sidebar of popup webpage and start using~
+5. Set the key in the sidebar of the web page that opens, then start using it~
 
    ![tutorial](https://github.com/user-attachments/assets/983ba58b-5ae3-4132-90f5-6d48801465dd)
 
-7. Transcription step will automatically download models from huggingface, or you can download manually and place `_model_cache` folder in VideoLingo directory: [Baidu Drive](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7)
-
-8. (Optional) More settings can be manually modified in `config.yaml`, watch command line output during operation
+6. (Optional) More settings can be manually modified in `config.yaml`; watch the command-line output during operation
 
 ## 🏭 Batch Mode (beta)
 

diff --git a/docs/pages/docs/start.zh-CN.md b/docs/pages/docs/start.zh-CN.md
@@ -115,13 +115,6 @@ VideoLingo提供了多种 tts 接入方式，以下是对比（如不使用配
 
 VideoLingo 支持 Windows、macOS 和 Linux 系统，可使用 CPU 或 GPU 运行。
 
-对于 Windows 系统使用 GPU 加速，需要安装以下依赖：
-
-- [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe)
-- [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe)
-
-> 注意：安装后需要将 `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` 添加至系统环境变量，并重启计算机 🔄
-
 ### Windows 一键安装
 
 请确保已安装 [Git](https://git-scm.com/downloads)，
@@ -135,56 +128,44 @@ VideoLingo 支持 Windows、macOS 和 Linux 系统，可使用 CPU 或 GPU 运
 
 3. 双击 `OneKeyInstall&Start.bat` 即可完成安装并启动网页
 
-### 源码安装
-
-开始安装 VideoLingo 之前，请确保:
-1. 预留 **25G** 硬盘空间
-2. 已安装 [Anaconda](https://www.anaconda.com/download) (用于 Python 环境管理)
-3. 已安装 [Git](https://git-scm.com/downloads) (用于克隆项目代码，也可以手动下载)
+### macOS/Linux 源码安装
 
-需要一定的 python 基础，遇到任何问题可以询问官方网站 [videolingo.io](https://videolingo.io) 右下角的AI助手~
+开始安装 VideoLingo 之前，请确保预留 **25G** 硬盘空间，并安装了 Git 和 Anaconda。
 
-1. 打开 `Anaconda Prompt` 并切换到你想安装的目录，例如桌面：
-   ```bash
-   cd desktop
-   ```
-
-2. 克隆项目并切换至项目目录：
+1. 克隆项目：
    ```bash
    git clone https://github.com/Huanshere/VideoLingo.git
    cd VideoLingo
    ```
 
-3. 创建并激活虚拟环境（**必须 3.10.0**）：
+2. 创建并激活虚拟环境（**必须 3.10.0**）：
    ```bash
    conda create -n videolingo python=3.10.0 -y
    conda activate videolingo
    ```
 
-4. （可选）应用汉化补丁：
+3. （可选）应用汉化补丁：
 
     参照 **一键安装** 中的说明
 
    （注意：Mac系统会删除整个目标文件夹后再复制，而Windows只会替换重复的文件。Mac用户建议手动将文件逐个移动到目标位置）
 
-5. 运行安装脚本：
+4. 运行安装脚本：
    ```bash
    python install.py
    ```
    脚本将自动安装相应的 torch 版本
 
-6. 🎉 输入命令或点击 `一键启动.bat` 启动 Streamlit 应用：
+5. 🎉 输入命令或点击 `一键启动.bat` 启动 Streamlit 应用：
    ```bash
    streamlit run st.py
    ```
 
-7. 在弹出网页的侧边栏中设置key，开始使用~
+6. 在弹出网页的侧边栏中设置key，开始使用~
 
    ![tutorial](https://github.com/user-attachments/assets/983ba58b-5ae3-4132-90f5-6d48801465dd)
 
-8. 转录步骤会自动从 huggingface 下载模型，也可以手动下载，将 `_model_cache` 文件夹放置在 VideoLingo 目录下：[百度网盘](https://pan.baidu.com/s/1Igo_FvFV4Xcb8tSYT0ktpA?pwd=e1c7)
-
-9. （可选）更多设置可以在 `config.yaml` 中手动修改，运行过程请注意命令行输出
+7. （可选）更多设置可以在 `config.yaml` 中手动修改，运行过程请注意命令行输出
 
 ## 🏭 批量模式（beta）