From 23ac93e3a40c53c86b747c417b5bbc83898c926d Mon Sep 17 00:00:00 2001 From: azman0101 Date: Sun, 24 Aug 2025 13:26:46 +0000 Subject: [PATCH 1/7] feat: use python module --- app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.py b/app.py index 811ad58..06e84cf 100644 --- a/app.py +++ b/app.py @@ -39,12 +39,12 @@ import json import os import re -import subprocess import tempfile from pathlib import Path from typing import List, Optional, Tuple import streamlit as st +from yt_dlp import YoutubeDL def run_yt_dlp(args: List[str]) -> None: From 8ce17e94d110025c7e74df0ba710b710a4021b82 Mon Sep 17 00:00:00 2001 From: azman0101 Date: Sun, 24 Aug 2025 13:26:55 +0000 Subject: [PATCH 2/7] feat: use python module --- app.py | 70 +++++++++++++++++----------------------------------------- 1 file changed, 20 insertions(+), 50 deletions(-) diff --git a/app.py b/app.py index 06e84cf..7d49d80 100644 --- a/app.py +++ b/app.py @@ -47,44 +47,18 @@ from yt_dlp import YoutubeDL -def run_yt_dlp(args: List[str]) -> None: - """Run a yt‑dlp command and raise an exception if it fails. - - The function is separated for easier mocking during tests. It calls - ``subprocess.run`` with the provided arguments and checks the return code. - - Parameters - ---------- - args: - A list of command line arguments to pass directly to ``yt‑dlp``. - - Raises - ------ - RuntimeError - If ``yt‑dlp`` returns a non‑zero exit status. - """ - result = subprocess.run(args, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError( - f"yt‑dlp failed with status {result.returncode}:\n" - f"STDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}" - ) - - def fetch_comments(video_url: str, work_dir: Path) -> List[str]: """Download and parse YouTube comments using yt‑dlp. - yt‑dlp can write comments into the JSON metadata file when invoked with - ``--write-comments``. We specify a custom output template so that the - resulting ``.info.json`` file has a predictable name based on the video ID. + yt‑dlp can extract comments directly when configured with appropriate options. + We use the extract_info method to get video information including comments. Parameters ---------- video_url: The full YouTube URL provided by the user. work_dir: - A directory in which to store temporary files. The JSON file will be - created here. + A directory in which to store temporary files if needed. Returns ------- @@ -99,29 +73,25 @@ def fetch_comments(video_url: str, work_dir: Path) -> List[str]: if not video_id_match: raise ValueError("Impossible d'extraire l'identifiant de la vidéo.") video_id = video_id_match.group(0) - json_path = work_dir / f"{video_id}.info.json" - - # Construct yt‑dlp command. We always skip downloading the media. - cmd = [ - "yt-dlp", - "--skip-download", - "--write-info-json", - "--write-comments", - "-o", - str(work_dir / f"{video_id}"), - video_url, - ] - run_yt_dlp(cmd) - if not json_path.exists(): - raise FileNotFoundError( - f"Fichier JSON introuvable : {json_path}. Assurez-vous que yt‑dlp est correctement installé." - ) - - with json_path.open("r", encoding="utf-8") as f: - info = json.load(f) + # Configure yt-dlp options for extracting comments and info + ydl_opts = { + 'skip_download': True, # Don't download the video file + 'writeinfojson': True, # Write video info to JSON + 'writecomments': True, # Extract comments + 'outtmpl': str(work_dir / f"{video_id}"), # Output template + 'quiet': True, # Reduce output noise + 'no_warnings': True, # Suppress warnings in the logs + } + + with YoutubeDL(ydl_opts) as ydl: + try: + info_dict = ydl.extract_info(video_url, download=False) + except Exception as e: + raise RuntimeError(f"yt-dlp failed to extract video info: {str(e)}") - comments_raw = info.get("comments") or [] + # Extract comments from the info dict + comments_raw = info_dict.get("comments") or [] comments_text: List[str] = [] for comment in comments_raw: # Some entries use the key "text", others use "txt". We normalise From 62bf308940cf6f5f7e57a701ac2b48beee3ab41d Mon Sep 17 00:00:00 2001 From: azman0101 Date: Sun, 24 Aug 2025 13:27:32 +0000 Subject: [PATCH 3/7] feat: use python module --- app.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/app.py b/app.py index 7d49d80..cb11c9c 100644 --- a/app.py +++ b/app.py @@ -168,21 +168,24 @@ def fetch_transcript(video_url: str, work_dir: Path, language: str = "fr") -> Tu video_id = video_id_match.group(0) base_output = work_dir / f"{video_id}" - # Try to fetch subtitles in the specified language - cmd = [ - "yt-dlp", - "--skip-download", - "--write-sub", - "--write-auto-subs", - "--sub-format", - "srt", - "--sub-lang", - language, - "-o", - str(base_output), - video_url, - ] - run_yt_dlp(cmd) + + # Configure yt-dlp options for extracting subtitles + ydl_opts = { + 'skip_download': True, # Don't download the video file + 'writesubtitles': True, # Download manual subtitles + 'writeautomaticsub': True, # Download auto-generated subtitles as fallback + 'subtitlesformat': 'srt', # Request SRT format + 'subtitleslangs': [language], # Request specific language + 'outtmpl': str(base_output), # Output template + 'quiet': True, # Reduce output noise + 'no_warnings': True, # Suppress warnings in the logs + } + + with YoutubeDL(ydl_opts) as ydl: + try: + info_dict = ydl.extract_info(video_url, download=False) + except Exception as e: + raise RuntimeError(f"yt-dlp failed to extract video info: {str(e)}") # Look for files like ..srt possible_files = list(work_dir.glob(f"{video_id}.*.srt")) From b964ff3f758eae7ac1832fc5fac473e69974b3c3 Mon Sep 17 00:00:00 2001 From: azman0101 Date: Sun, 24 Aug 2025 13:28:40 +0000 Subject: [PATCH 4/7] feat: use python module --- app.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/app.py b/app.py index cb11c9c..9c8b8ee 100644 --- a/app.py +++ b/app.py @@ -4,7 +4,7 @@ This Streamlit application provides a simple user interface for retrieving the transcript (subtitles) and top‑level comments from a YouTube video. It uses -``yt‑dlp`` under the hood to download the comments and subtitles for a given +the ``yt‑dlp`` Python module to download the comments and subtitles for a given video URL. The resulting transcript and comments are displayed directly in the browser and can optionally be downloaded as plain text files. @@ -224,8 +224,7 @@ def main() -> None: st.markdown( """ Entrez un lien **YouTube** ci‑dessous pour obtenir sa transcription et - ses commentaires. Le traitement utilise l'outil `yt‑dlp` en interne. - Veillez donc à ce que celui‑ci soit installé sur votre machine. + ses commentaires. Le traitement utilise le module Python `yt‑dlp` en interne. """ ) From 031dfed64ce1799339c9b53548270a6a3a7b7607 Mon Sep 17 00:00:00 2001 From: azman0101 Date: Sun, 24 Aug 2025 13:33:26 +0000 Subject: [PATCH 5/7] chore: change approche --- README.md | 12 +++++-- app.py | 95 +++++++++++++++++++++++++++++++++++++------------------ 2 files changed, 75 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index e0b5672..cbddfb6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,16 @@ -# Transcripteur et commentaires YouTube avec Streamlit +# Transcr1. **Python 3.8 ou supérieur** doit être installé sur votre machine. +2. Les dépendances Python nécessaires sont listées dans + `requirements.txt`. Installez‑les avec : + + ```bash + pip install -r requirements.txt + ``` + + Cela installera notamment `streamlit`, `yt-dlp` et `pysrt` pour le parsing des sous‑titres.mmentaires YouTube avec Streamlit Ce projet propose une petite application web réalisée avec **Streamlit** pour récupérer la transcription (sous‑titres) et les commentaires associés à une -vidéo YouTube. L'outil s'appuie sur le programme ligne de commande +vidéo YouTube. L'outil s'appuie sur le module Python [`yt-dlp`](https://github.com/yt-dlp/yt-dlp) pour extraire les données. ## Prérequis diff --git a/app.py b/app.py index 9c8b8ee..62f3dd0 100644 --- a/app.py +++ b/app.py @@ -77,9 +77,7 @@ def fetch_comments(video_url: str, work_dir: Path) -> List[str]: # Configure yt-dlp options for extracting comments and info ydl_opts = { 'skip_download': True, # Don't download the video file - 'writeinfojson': True, # Write video info to JSON - 'writecomments': True, # Extract comments - 'outtmpl': str(work_dir / f"{video_id}"), # Output template + 'getcomments': True, # Extract comments into info dict 'quiet': True, # Reduce output noise 'no_warnings': True, # Suppress warnings in the logs } @@ -169,14 +167,12 @@ def fetch_transcript(video_url: str, work_dir: Path, language: str = "fr") -> Tu base_output = work_dir / f"{video_id}" - # Configure yt-dlp options for extracting subtitles + # Configure yt-dlp options for extracting subtitles info ydl_opts = { 'skip_download': True, # Don't download the video file - 'writesubtitles': True, # Download manual subtitles - 'writeautomaticsub': True, # Download auto-generated subtitles as fallback - 'subtitlesformat': 'srt', # Request SRT format - 'subtitleslangs': [language], # Request specific language - 'outtmpl': str(base_output), # Output template + 'writesubtitles': False, # Don't write subtitle files + 'writeautomaticsub': False, # Don't write auto-generated subtitle files + 'listsubtitles': False, # We don't just want a list 'quiet': True, # Reduce output noise 'no_warnings': True, # Suppress warnings in the logs } @@ -187,29 +183,68 @@ def fetch_transcript(video_url: str, work_dir: Path, language: str = "fr") -> Tu except Exception as e: raise RuntimeError(f"yt-dlp failed to extract video info: {str(e)}") - # Look for files like ..srt - possible_files = list(work_dir.glob(f"{video_id}.*.srt")) + # Try to get subtitles from info dict first + subtitles = info_dict.get('subtitles', {}) + automatic_captions = info_dict.get('automatic_captions', {}) + selected_lang = "" - subtitle_path: Optional[Path] = None - for file in possible_files: - suffix_parts = file.name.split(".") - if len(suffix_parts) >= 3: - lang_code = suffix_parts[-2] # filename format: ..srt - if lang_code == language: - subtitle_path = file - selected_lang = lang_code - break - # If not found, fall back to the first available language - if subtitle_path is None and possible_files: - subtitle_path = possible_files[0] - parts = subtitle_path.name.split(".") - selected_lang = parts[-2] if len(parts) >= 3 else language - transcript = "" - if subtitle_path and subtitle_path.exists(): - with subtitle_path.open("r", encoding="utf-8") as f: - contents = f.read() - transcript = parse_srt_contents(contents) + + # First try manual subtitles in the requested language + if language in subtitles and subtitles[language]: + selected_lang = language + subtitle_info = subtitles[language][0] # Take first available format + # Download the subtitle content + sub_url = subtitle_info.get('url') + if sub_url: + try: + with YoutubeDL({'quiet': True, 'no_warnings': True}) as sub_ydl: + sub_content = sub_ydl.urlopen(sub_url).read().decode('utf-8') + transcript = parse_srt_contents(sub_content) + except Exception: + transcript = "" + + # If no manual subtitles, try automatic captions + if not transcript and language in automatic_captions and automatic_captions[language]: + selected_lang = language + caption_info = automatic_captions[language][0] # Take first available format + # Download the caption content + cap_url = caption_info.get('url') + if cap_url: + try: + with YoutubeDL({'quiet': True, 'no_warnings': True}) as cap_ydl: + cap_content = cap_ydl.urlopen(cap_url).read().decode('utf-8') + transcript = parse_srt_contents(cap_content) + except Exception: + transcript = "" + + # If still no transcript, try fallback to English + if not transcript: + for fallback_lang in ['en', 'en-US', 'en-GB']: + if fallback_lang in subtitles and subtitles[fallback_lang]: + selected_lang = fallback_lang + subtitle_info = subtitles[fallback_lang][0] + sub_url = subtitle_info.get('url') + if sub_url: + try: + with YoutubeDL({'quiet': True, 'no_warnings': True}) as sub_ydl: + sub_content = sub_ydl.urlopen(sub_url).read().decode('utf-8') + transcript = parse_srt_contents(sub_content) + break + except Exception: + continue + elif fallback_lang in automatic_captions and automatic_captions[fallback_lang]: + selected_lang = fallback_lang + caption_info = automatic_captions[fallback_lang][0] + cap_url = caption_info.get('url') + if cap_url: + try: + with YoutubeDL({'quiet': True, 'no_warnings': True}) as cap_ydl: + cap_content = cap_ydl.urlopen(cap_url).read().decode('utf-8') + transcript = parse_srt_contents(cap_content) + break + except Exception: + continue return selected_lang, transcript From e18222a17f368e038a8066d7db0bfecadae1df97 Mon Sep 17 00:00:00 2001 From: azman0101 Date: Sun, 24 Aug 2025 13:36:38 +0000 Subject: [PATCH 6/7] chore: change approche --- app.py | 107 +++++++++++++++++++---------------------------- requirements.txt | 4 +- 2 files changed, 44 insertions(+), 67 deletions(-) diff --git a/app.py b/app.py index 62f3dd0..4c55b24 100644 --- a/app.py +++ b/app.py @@ -77,19 +77,30 @@ def fetch_comments(video_url: str, work_dir: Path) -> List[str]: # Configure yt-dlp options for extracting comments and info ydl_opts = { 'skip_download': True, # Don't download the video file - 'getcomments': True, # Extract comments into info dict + 'writecomments': True, # Extract comments + 'writeinfojson': True, # Write info to JSON to capture comments + 'outtmpl': str(work_dir / f"{video_id}"), # Output template 'quiet': True, # Reduce output noise 'no_warnings': True, # Suppress warnings in the logs } with YoutubeDL(ydl_opts) as ydl: try: + # This will create a .info.json file with comments info_dict = ydl.extract_info(video_url, download=False) except Exception as e: raise RuntimeError(f"yt-dlp failed to extract video info: {str(e)}") - # Extract comments from the info dict + # Check if we have comments in the info dict directly comments_raw = info_dict.get("comments") or [] + + # If no comments in info dict, try to read from the JSON file + if not comments_raw: + json_path = work_dir / f"{video_id}.info.json" + if json_path.exists(): + with json_path.open("r", encoding="utf-8") as f: + info = json.load(f) + comments_raw = info.get("comments") or [] comments_text: List[str] = [] for comment in comments_raw: # Some entries use the key "text", others use "txt". We normalise @@ -167,84 +178,50 @@ def fetch_transcript(video_url: str, work_dir: Path, language: str = "fr") -> Tu base_output = work_dir / f"{video_id}" - # Configure yt-dlp options for extracting subtitles info + # Configure yt-dlp options for extracting subtitles ydl_opts = { 'skip_download': True, # Don't download the video file - 'writesubtitles': False, # Don't write subtitle files - 'writeautomaticsub': False, # Don't write auto-generated subtitle files - 'listsubtitles': False, # We don't just want a list + 'writesubtitles': True, # Download manual subtitles + 'writeautomaticsub': True, # Download auto-generated subtitles as fallback + 'subtitlesformat': 'srt', # Request SRT format + 'subtitleslangs': [language], # Request specific language + 'outtmpl': str(base_output), # Output template 'quiet': True, # Reduce output noise 'no_warnings': True, # Suppress warnings in the logs } with YoutubeDL(ydl_opts) as ydl: try: + # This will download subtitle files to the work directory info_dict = ydl.extract_info(video_url, download=False) except Exception as e: raise RuntimeError(f"yt-dlp failed to extract video info: {str(e)}") - # Try to get subtitles from info dict first - subtitles = info_dict.get('subtitles', {}) - automatic_captions = info_dict.get('automatic_captions', {}) - + # Look for files like ..srt + possible_files = list(work_dir.glob(f"{video_id}.*.srt")) selected_lang = "" - transcript = "" + subtitle_path: Optional[Path] = None - # First try manual subtitles in the requested language - if language in subtitles and subtitles[language]: - selected_lang = language - subtitle_info = subtitles[language][0] # Take first available format - # Download the subtitle content - sub_url = subtitle_info.get('url') - if sub_url: - try: - with YoutubeDL({'quiet': True, 'no_warnings': True}) as sub_ydl: - sub_content = sub_ydl.urlopen(sub_url).read().decode('utf-8') - transcript = parse_srt_contents(sub_content) - except Exception: - transcript = "" + for file in possible_files: + suffix_parts = file.name.split(".") + if len(suffix_parts) >= 3: + lang_code = suffix_parts[-2] # filename format: ..srt + if lang_code == language: + subtitle_path = file + selected_lang = lang_code + break - # If no manual subtitles, try automatic captions - if not transcript and language in automatic_captions and automatic_captions[language]: - selected_lang = language - caption_info = automatic_captions[language][0] # Take first available format - # Download the caption content - cap_url = caption_info.get('url') - if cap_url: - try: - with YoutubeDL({'quiet': True, 'no_warnings': True}) as cap_ydl: - cap_content = cap_ydl.urlopen(cap_url).read().decode('utf-8') - transcript = parse_srt_contents(cap_content) - except Exception: - transcript = "" - - # If still no transcript, try fallback to English - if not transcript: - for fallback_lang in ['en', 'en-US', 'en-GB']: - if fallback_lang in subtitles and subtitles[fallback_lang]: - selected_lang = fallback_lang - subtitle_info = subtitles[fallback_lang][0] - sub_url = subtitle_info.get('url') - if sub_url: - try: - with YoutubeDL({'quiet': True, 'no_warnings': True}) as sub_ydl: - sub_content = sub_ydl.urlopen(sub_url).read().decode('utf-8') - transcript = parse_srt_contents(sub_content) - break - except Exception: - continue - elif fallback_lang in automatic_captions and automatic_captions[fallback_lang]: - selected_lang = fallback_lang - caption_info = automatic_captions[fallback_lang][0] - cap_url = caption_info.get('url') - if cap_url: - try: - with YoutubeDL({'quiet': True, 'no_warnings': True}) as cap_ydl: - cap_content = cap_ydl.urlopen(cap_url).read().decode('utf-8') - transcript = parse_srt_contents(cap_content) - break - except Exception: - continue + # If not found, fall back to the first available language + if subtitle_path is None and possible_files: + subtitle_path = possible_files[0] + parts = subtitle_path.name.split(".") + selected_lang = parts[-2] if len(parts) >= 3 else language + + transcript = "" + if subtitle_path and subtitle_path.exists(): + with subtitle_path.open("r", encoding="utf-8") as f: + contents = f.read() + transcript = parse_srt_contents(contents) return selected_lang, transcript diff --git a/requirements.txt b/requirements.txt index 41c95c6..471be01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ streamlit>=1.26 -yt-dlp>=2025.3.0 -pysrt>=1.1.2 \ No newline at end of file +yt-dlp>=2025.08.22 +pysrt>=1.48.1 \ No newline at end of file From 23a0b1b9bfbc8e36545f7d45deba928463dba4f7 Mon Sep 17 00:00:00 2001 From: azman0101 Date: Sun, 24 Aug 2025 13:54:42 +0000 Subject: [PATCH 7/7] fix: change error handling --- README.md | 3 +- app.py | 377 +++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 289 insertions(+), 91 deletions(-) diff --git a/README.md b/README.md index cbddfb6..34cdc33 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,8 @@ Ce projet propose une petite application web réalisée avec **Streamlit** pour récupérer la transcription (sous‑titres) et les commentaires associés à une vidéo YouTube. L'outil s'appuie sur le module Python -[`yt-dlp`](https://github.com/yt-dlp/yt-dlp) pour extraire les données. +[`yt-dlp`](https://github.com/yt-dlp/yt-dlp) pour extraire les données de manière +intégrée dans l'application. ## Prérequis diff --git a/app.py b/app.py index 4c55b24..29e6897 100644 --- a/app.py +++ b/app.py @@ -8,13 +8,16 @@ video URL. The resulting transcript and comments are displayed directly in the browser and can optionally be downloaded as plain text files. +This version includes multiple fallback strategies to handle YouTube's bot +detection and authentication requirements. + Requirements ------------ To run this application you must have the following tools installed on your machine: -* **Python 3.8 or newer** +* **Python 3.8 or newer** * **yt‑dlp** – a fork of youtube‑dl capable of downloading comments and subtitles. Installation instructions are available in the official repository: https://github.com/yt‑dlp/yt‑dlp @@ -48,10 +51,10 @@ def fetch_comments(video_url: str, work_dir: Path) -> List[str]: - """Download and parse YouTube comments using yt‑dlp. + """Download and parse YouTube comments using yt‑dlp with bot detection workarounds. - yt‑dlp can extract comments directly when configured with appropriate options. - We use the extract_info method to get video information including comments. + This function implements multiple fallback strategies to handle YouTube's + bot detection and authentication requirements. Parameters ---------- @@ -74,42 +77,113 @@ def fetch_comments(video_url: str, work_dir: Path) -> List[str]: raise ValueError("Impossible d'extraire l'identifiant de la vidéo.") video_id = video_id_match.group(0) - # Configure yt-dlp options for extracting comments and info - ydl_opts = { - 'skip_download': True, # Don't download the video file - 'writecomments': True, # Extract comments - 'writeinfojson': True, # Write info to JSON to capture comments - 'outtmpl': str(work_dir / f"{video_id}"), # Output template - 'quiet': True, # Reduce output noise - 'no_warnings': True, # Suppress warnings in the logs - } - - with YoutubeDL(ydl_opts) as ydl: + # Multiple approaches to try in case of YouTube bot detection + approaches = [ + # Approach 1: Use Chrome browser cookies + { + 'skip_download': True, + 'writecomments': True, + 'writeinfojson': True, + 'outtmpl': str(work_dir / f"{video_id}"), + 'quiet': True, + 'no_warnings': True, + 'cookiesfrombrowser': ('chrome',), + }, + # Approach 2: Use Firefox browser cookies + { + 'skip_download': True, + 'writecomments': True, + 'writeinfojson': True, + 'outtmpl': str(work_dir / f"{video_id}"), + 'quiet': True, + 'no_warnings': True, + 'cookiesfrombrowser': ('firefox',), + }, + # Approach 3: Custom headers to simulate real browser + { + 'skip_download': True, + 'writecomments': True, + 'writeinfojson': True, + 'outtmpl': str(work_dir / f"{video_id}"), + 'quiet': True, + 'no_warnings': True, + 'http_headers': { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive', + } + }, + # Approach 4: YouTube extractor arguments + { + 'skip_download': True, + 'writecomments': True, + 'writeinfojson': True, + 'outtmpl': str(work_dir / f"{video_id}"), + 'quiet': True, + 'no_warnings': True, + 'extractor_args': { + 'youtube': { + 'player_client': ['tv', 'web', 'ios'], + 'skip': ['hls', 'dash'], + } + } + }, + # Approach 5: Basic configuration (last resort) + { + 'skip_download': True, + 'writecomments': True, + 'writeinfojson': True, + 'outtmpl': str(work_dir / f"{video_id}"), + 'quiet': True, + 'no_warnings': True, + } + ] + + last_error = None + + for approach_num, ydl_opts in enumerate(approaches, 1): try: - # This will create a .info.json file with comments - info_dict = ydl.extract_info(video_url, download=False) + with YoutubeDL(ydl_opts) as ydl: + info_dict = ydl.extract_info(video_url, download=False) + + # Check if we have comments in the info dict directly + comments_raw = info_dict.get("comments") or [] + + # If no comments in info dict, try to read from the JSON file + if not comments_raw: + json_path = work_dir / f"{video_id}.info.json" + if json_path.exists(): + with json_path.open("r", encoding="utf-8") as f: + info = json.load(f) + comments_raw = info.get("comments") or [] + + # Extract text from comments + comments_text: List[str] = [] + for comment in comments_raw: + # Some entries use the key "text", others use "txt". We normalise + # both. + text = comment.get("text") or comment.get("txt") or "" + if text: + comments_text.append(text.strip()) + + return comments_text + except Exception as e: - raise RuntimeError(f"yt-dlp failed to extract video info: {str(e)}") - - # Check if we have comments in the info dict directly - comments_raw = info_dict.get("comments") or [] - - # If no comments in info dict, try to read from the JSON file - if not comments_raw: - json_path = work_dir / f"{video_id}.info.json" - if json_path.exists(): - with json_path.open("r", encoding="utf-8") as f: - info = json.load(f) - comments_raw = info.get("comments") or [] - comments_text: List[str] = [] - for comment in comments_raw: - # Some entries use the key "text", others use "txt". We normalise - # both. - text = comment.get("text") or comment.get("txt") or "" - if text: - comments_text.append(text.strip()) - - return comments_text + last_error = e + error_msg = str(e) + + # If it's not the bot detection error, re-raise immediately + if "Sign in to confirm" not in error_msg and "bot" not in error_msg.lower(): + raise RuntimeError(f"yt-dlp failed to extract video info: {str(e)}") + + # Otherwise, continue to next approach + continue + + # If all approaches failed, raise the last error with context + raise RuntimeError(f"Failed to extract comments after trying {len(approaches)} approaches. " + f"YouTube may be blocking access. Last error: {last_error}") def parse_srt_contents(contents: str) -> str: @@ -145,13 +219,10 @@ def parse_srt_contents(contents: str) -> str: def fetch_transcript(video_url: str, work_dir: Path, language: str = "fr") -> Tuple[str, str]: - """Download and parse YouTube subtitles using yt‑dlp. + """Download and parse YouTube subtitles using yt‑dlp with bot detection workarounds. - yt‑dlp will attempt to download manually provided subtitles first and fall - back to automatically generated subtitles when ``--write-auto-subs`` is - specified. We request subtitles in the desired language and fall back to - English if none are available. The resulting transcript is returned as a - string. + This function implements multiple fallback strategies to handle YouTube's + bot detection and authentication requirements when fetching subtitles. Parameters ---------- @@ -178,52 +249,128 @@ def fetch_transcript(video_url: str, work_dir: Path, language: str = "fr") -> Tu base_output = work_dir / f"{video_id}" - # Configure yt-dlp options for extracting subtitles - ydl_opts = { - 'skip_download': True, # Don't download the video file - 'writesubtitles': True, # Download manual subtitles - 'writeautomaticsub': True, # Download auto-generated subtitles as fallback - 'subtitlesformat': 'srt', # Request SRT format - 'subtitleslangs': [language], # Request specific language - 'outtmpl': str(base_output), # Output template - 'quiet': True, # Reduce output noise - 'no_warnings': True, # Suppress warnings in the logs - } - - with YoutubeDL(ydl_opts) as ydl: + # Multiple approaches to try in case of YouTube bot detection + approaches = [ + # Approach 1: Use Chrome browser cookies + { + 'skip_download': True, + 'writesubtitles': True, + 'writeautomaticsub': True, + 'subtitlesformat': 'srt', + 'subtitleslangs': [language], + 'outtmpl': str(base_output), + 'quiet': True, + 'no_warnings': True, + 'cookiesfrombrowser': ('chrome',), + }, + # Approach 2: Use Firefox browser cookies + { + 'skip_download': True, + 'writesubtitles': True, + 'writeautomaticsub': True, + 'subtitlesformat': 'srt', + 'subtitleslangs': [language], + 'outtmpl': str(base_output), + 'quiet': True, + 'no_warnings': True, + 'cookiesfrombrowser': ('firefox',), + }, + # Approach 3: Custom headers to simulate real browser + { + 'skip_download': True, + 'writesubtitles': True, + 'writeautomaticsub': True, + 'subtitlesformat': 'srt', + 'subtitleslangs': [language], + 'outtmpl': str(base_output), + 'quiet': True, + 'no_warnings': True, + 'http_headers': { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Connection': 'keep-alive', + } + }, + # Approach 4: YouTube extractor arguments + { + 'skip_download': True, + 'writesubtitles': True, + 'writeautomaticsub': True, + 'subtitlesformat': 'srt', + 'subtitleslangs': [language], + 'outtmpl': str(base_output), + 'quiet': True, + 'no_warnings': True, + 'extractor_args': { + 'youtube': { + 'player_client': ['tv', 'web', 'ios'], + 'skip': ['hls', 'dash'], + } + } + }, + # Approach 5: Basic configuration (last resort) + { + 'skip_download': True, + 'writesubtitles': True, + 'writeautomaticsub': True, + 'subtitlesformat': 'srt', + 'subtitleslangs': [language], + 'outtmpl': str(base_output), + 'quiet': True, + 'no_warnings': True, + } + ] + + last_error = None + + for approach_num, ydl_opts in enumerate(approaches, 1): try: - # This will download subtitle files to the work directory - info_dict = ydl.extract_info(video_url, download=False) + with YoutubeDL(ydl_opts) as ydl: + info_dict = ydl.extract_info(video_url, download=False) + + # Look for files like ..srt + possible_files = list(work_dir.glob(f"{video_id}.*.srt")) + selected_lang = "" + subtitle_path: Optional[Path] = None + + for file in possible_files: + suffix_parts = file.name.split(".") + if len(suffix_parts) >= 3: + lang_code = suffix_parts[-2] # filename format: ..srt + if lang_code == language: + subtitle_path = file + selected_lang = lang_code + break + + # If not found, fall back to the first available language + if subtitle_path is None and possible_files: + subtitle_path = possible_files[0] + parts = subtitle_path.name.split(".") + selected_lang = parts[-2] if len(parts) >= 3 else language + + transcript = "" + if subtitle_path and subtitle_path.exists(): + with subtitle_path.open("r", encoding="utf-8") as f: + contents = f.read() + transcript = parse_srt_contents(contents) + + return selected_lang, transcript + except Exception as e: - raise RuntimeError(f"yt-dlp failed to extract video info: {str(e)}") - - # Look for files like ..srt - possible_files = list(work_dir.glob(f"{video_id}.*.srt")) - selected_lang = "" - subtitle_path: Optional[Path] = None - - for file in possible_files: - suffix_parts = file.name.split(".") - if len(suffix_parts) >= 3: - lang_code = suffix_parts[-2] # filename format: ..srt - if lang_code == language: - subtitle_path = file - selected_lang = lang_code - break - - # If not found, fall back to the first available language - if subtitle_path is None and possible_files: - subtitle_path = possible_files[0] - parts = subtitle_path.name.split(".") - selected_lang = parts[-2] if len(parts) >= 3 else language - - transcript = "" - if subtitle_path and subtitle_path.exists(): - with subtitle_path.open("r", encoding="utf-8") as f: - contents = f.read() - transcript = parse_srt_contents(contents) - - return selected_lang, transcript + last_error = e + error_msg = str(e) + + # If it's not the bot detection error, re-raise immediately + if "Sign in to confirm" not in error_msg and "bot" not in error_msg.lower(): + raise RuntimeError(f"yt-dlp failed to extract video info: {str(e)}") + + # Otherwise, continue to next approach + continue + + # If all approaches failed, return empty result with warning + return language, "" def main() -> None: @@ -237,6 +384,15 @@ def main() -> None: """ Entrez un lien **YouTube** ci‑dessous pour obtenir sa transcription et ses commentaires. Le traitement utilise le module Python `yt‑dlp` en interne. + + ⚠️ **Note importante :** YouTube a récemment renforcé ses protections anti-bot. + Si vous rencontrez des erreurs, voici les solutions : + + 1. **Connectez-vous à YouTube dans votre navigateur** (Chrome ou Firefox) + 2. **Essayez avec des vidéos moins populaires** + 3. **Attendez quelques minutes entre les tentatives** + + L'application essaiera automatiquement plusieurs méthodes pour contourner ces restrictions. """ ) @@ -305,8 +461,49 @@ def main() -> None: else: st.warning("Aucun commentaire n'a pu être récupéré pour cette vidéo.") except Exception as exc: - st.error(f"Une erreur est survenue : {exc}") + error_msg = str(exc) + if "Sign in to confirm" in error_msg or "bot" in error_msg.lower(): + st.error("❌ YouTube a détecté une activité automatisée") + st.markdown(""" + **Solutions recommandées :** + + 1. **Connectez-vous à YouTube** dans votre navigateur (Chrome ou Firefox) + 2. **Réessayez dans quelques minutes** + 3. **Utilisez une vidéo différente** (moins populaire) + 4. **Vérifiez votre connexion internet** + + Cette erreur est temporaire et liée aux protections anti-bot de YouTube. + """) + elif "Failed to extract comments after trying" in error_msg: + st.error("❌ Impossible de récupérer les données après plusieurs tentatives") + st.markdown(""" + **Que s'est-il passé ?** + + L'application a essayé plusieurs méthodes pour contourner les protections YouTube, + mais toutes ont échoué. Ceci peut arriver avec des vidéos très populaires ou + lorsque YouTube renforce temporairement ses restrictions. + + **Solutions :** + + 1. Réessayez avec une autre vidéo + 2. Attendez 10-15 minutes avant de réessayer + 3. Vérifiez que l'URL est correcte et que la vidéo est publique + """) + else: + st.error(f"Une erreur est survenue : {exc}") + st.markdown(""" + **Aide au débogage :** + + Si cette erreur persiste, vérifiez : + - L'URL YouTube est correcte + - La vidéo est publique (pas privée ou supprimée) + - Votre connexion internet fonctionne + """) + + # Afficher les détails techniques en cas de besoin + with st.expander("Détails techniques de l'erreur"): + st.code(str(exc)) if __name__ == "__main__": - main() \ No newline at end of file + main()