diff --git a/llvd/__init__.py b/llvd/__init__.py
index dd77ffc..6c8ca67 100755
--- a/llvd/__init__.py
+++ b/llvd/__init__.py
@@ -1,3 +1,3 @@
 # -*- encoding: utf-8 -*-
 """Linkedin Learning Video Downloader."""
-__version__ = "3.0.9"
+__version__ = "3.1.0"
diff --git a/llvd/app.py b/llvd/app.py
index 06183b3..805cb62 100644
--- a/llvd/app.py
+++ b/llvd/app.py
@@ -22,7 +22,7 @@ class App:
 
     def __init__(
-        self, email, password, course_slug, resolution, caption, exercise, throttle
+        self, email, password, course_slug, resolution, caption, exercise, throttle, proxies=None
     ):
         self.email = email
         self.password = password
@@ -44,6 +44,10 @@ def __init__(
         self.throttle = throttle
         self.debug_mode = True
 
+        # Initialize proxy support
+        self.proxies = proxies or []
+        self.current_proxy_idx = 0
+
         # Initialize summary tracking
         self.summary = {
             'courses_processed': 0,
@@ -118,6 +140,15 @@ def login(self, session, login_data):
             )
         )
 
+    def get_session(self):
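+        """Build a requests Session, optionally routed through one of the
+        proxies supplied via --proxy-file (see _get_next_proxy)."""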
resolution == "1080" and "720" in resolutions_to_try: continue raise - except requests.exceptions.RequestException as e: last_exception = e if resolution == "1080" and "720" in resolutions_to_try: @@ -352,7 +386,7 @@ def fetch_video(self, video): if last_exception: raise last_exception raise ValueError("Failed to fetch video data") - + # Get subtitles if available (from the last successful response) subtitles = page_json["elements"][0].get("selectedVideo", {}).get("transcript") duration_in_ms = int(page_json["elements"][0].get("selectedVideo", {}).get("durationInSeconds", 0)) * 1000 @@ -364,16 +398,16 @@ def fetch_video(self, video): ) ) try: - download_video( + download_success = download_video( download_url, self.current_video_index, video_name, self.chapter_path, self.throttle, ) - + # Only try to download subtitles if video download was successful - if subtitles and self.caption: + if subtitles and self.caption and download_success: try: download_subtitles( self.current_video_index, @@ -384,10 +418,10 @@ def fetch_video(self, video): ) except Exception as e: click.echo(click.style( - f"[WARNING] Failed to download subtitles: {str(e)}", + f"[WARNING] Failed to download subtitles: {str(e)}", fg="yellow" )) - + except Exception as e: self._start_modified_spinner("...") if self.debug_mode: @@ -408,6 +442,8 @@ def fetch_video(self, video): # Restore the original video format self.video_format = original_format + return page_json, video_name, download_success + def _extract_video_url(self, page_json, video_slug, video_name): """Extract video URL from the JSON response with multiple fallback methods""" if not page_json or "elements" not in page_json or not page_json["elements"]: @@ -468,9 +504,6 @@ def _save_debug_info(self, response_data, video_slug, error_type): return None def fetch_chapter(self, chapter, chapters_pad_length, delay): - """ - Downloads all videos in a chapter with enhanced error handling and debugging - """ chapter_name = chapter["title"] videos = chapter["videos"] chapters_index_padded = str(self.current_chapter_index).rjust( @@ -506,6 +539,9 @@ def fetch_chapter(self, chapter, chapters_pad_length, delay): self.summary['videos']['total'] += len(videos) + # Track how many videos were downloaded in this chapter + downloaded_in_chapter = 0 + for video in videos_to_download: self.current_video_index = video_index + len(current_files) video_name = re.sub(r'[\\/*?:"<>|]', "", video["title"]) @@ -525,7 +561,7 @@ def fetch_chapter(self, chapter, chapters_pad_length, delay): try: # Fetch video data try: - page_json, video_name = self.fetch_video(video) + page_json, video_name, download_success = self.fetch_video(video) except ValueError as e: if "locked" in str(e).lower() or "premium" in str(e).lower(): click.echo(click.style( @@ -549,60 +585,14 @@ def fetch_chapter(self, chapter, chapters_pad_length, delay): video_index += 1 continue - # Get subtitles if available - selected_video = page_json["elements"][0].get("selectedVideo", {}) - subtitles = selected_video.get("transcript") - duration_in_ms = int(selected_video.get("durationInSeconds", 0)) * 1000 - - click.echo( - click.style( - f"\nCurrent: {chapters_index_padded}. {clean_name(chapter_name)}/" - f"{self.current_video_index:02d}. 
         resolutions_to_try = [self.video_format]
         if self.video_format == "1080":
             resolutions_to_try = ["1080", "720"]
-
+
         last_exception = None
-
+        page_json = None
+        download_url = None
+        download_success = False
+
         for resolution in resolutions_to_try:
             self.video_format = resolution
             video_url = config.video_url.format(
                 self.course_slug, resolution, video_slug
             )
-
+
             try:
-                page_data = requests.get(
-                    video_url,
-                    cookies=self.cookies,
-                    headers=self.headers,
-                    allow_redirects=False,
-                    timeout=30
-                )
+                with self.get_session() as session:
+                    page_data = session.get(
+                        video_url,
+                        cookies=self.cookies,
+                        headers=self.headers,
+                        allow_redirects=False,
+                        timeout=30
+                    )
 
                 try:
                     page_json = page_data.json()
-
                     # Check for locked/premium content
                     if "elements" in page_json and page_json["elements"] and \
                        isinstance(page_json["elements"][0], dict):
@@ -334,15 +371,12 @@ def fetch_video(self, video):
                     if resolution == "1080" and "720" in resolutions_to_try:
                         continue
                     raise ValueError("No video URL found")
-
-                break
 
             except ValueError as e:
                 last_exception = e
                 if resolution == "1080" and "720" in resolutions_to_try:
                     continue
                 raise
-
             except requests.exceptions.RequestException as e:
                 last_exception = e
                 if resolution == "1080" and "720" in resolutions_to_try:
@@ -352,7 +386,7 @@ def fetch_video(self, video):
         if last_exception:
             raise last_exception
         raise ValueError("Failed to fetch video data")
-
+
         # Get subtitles if available (from the last successful response)
         subtitles = page_json["elements"][0].get("selectedVideo", {}).get("transcript")
         duration_in_ms = int(page_json["elements"][0].get("selectedVideo", {}).get("durationInSeconds", 0)) * 1000
@@ -364,16 +398,16 @@ def fetch_video(self, video):
             )
         )
         try:
-            download_video(
+            download_success = download_video(
                 download_url,
                 self.current_video_index,
                 video_name,
                 self.chapter_path,
                 self.throttle,
             )
-
+
             # Only try to download subtitles if video download was successful
-            if subtitles and self.caption:
+            if subtitles and self.caption and download_success:
                 try:
                     download_subtitles(
@@ -384,10 +418,10 @@ def fetch_video(self, video):
                 except Exception as e:
                     click.echo(click.style(
-                        f"[WARNING] Failed to download subtitles: {str(e)}", 
+                        f"[WARNING] Failed to download subtitles: {str(e)}",
                         fg="yellow"
                     ))
-
+
         except Exception as e:
             self._start_modified_spinner("...")
             if self.debug_mode:
@@ -408,6 +442,8 @@ def fetch_video(self, video):
         # Restore the original video format
         self.video_format = original_format
 
+        return page_json, video_name, download_success
+
     def _extract_video_url(self, page_json, video_slug, video_name):
         """Extract video URL from the JSON response with multiple fallback methods"""
         if not page_json or "elements" not in page_json or not page_json["elements"]:
@@ -468,9 +504,6 @@ def _save_debug_info(self, response_data, video_slug, error_type):
         return None
 
     def fetch_chapter(self, chapter, chapters_pad_length, delay):
-        """
-        Downloads all videos in a chapter with enhanced error handling and debugging
-        """
         chapter_name = chapter["title"]
         videos = chapter["videos"]
         chapters_index_padded = str(self.current_chapter_index).rjust(
@@ -506,6 +539,9 @@ def fetch_chapter(self, chapter, chapters_pad_length, delay):
 
         self.summary['videos']['total'] += len(videos)
 
+        # Track how many videos were downloaded in this chapter
+        downloaded_in_chapter = 0
+
        for video in videos_to_download:
             self.current_video_index = video_index + len(current_files)
             video_name = re.sub(r'[\\/*?:"<>|]', "", video["title"])
@@ -525,7 +561,7 @@ def fetch_chapter(self, chapter, chapters_pad_length, delay):
             try:
                 # Fetch video data
                 try:
-                    page_json, video_name = self.fetch_video(video)
+                    page_json, video_name, download_success = self.fetch_video(video)
                 except ValueError as e:
                     if "locked" in str(e).lower() or "premium" in str(e).lower():
                         click.echo(click.style(
@@ -549,60 +585,14 @@ def fetch_chapter(self, chapter, chapters_pad_length, delay):
                     video_index += 1
                     continue
 
-                # Get subtitles if available
-                selected_video = page_json["elements"][0].get("selectedVideo", {})
-                subtitles = selected_video.get("transcript")
-                duration_in_ms = int(selected_video.get("durationInSeconds", 0)) * 1000
-
-                click.echo(
-                    click.style(
-                        f"\nCurrent: {chapters_index_padded}. {clean_name(chapter_name)}/"
-                        f"{self.current_video_index:02d}. {video_name}.mp4 @{self.video_format}p"
-                    )
-                )
-
-                # Download the video
-                try:
-                    download_video(
-                        download_url,
-                        self.current_video_index,
-                        video_name,
-                        chapter_path,
-                        delay,
-                    )
-
-                    # Only try to download subtitles if video download was successful
-                    if subtitles and self.caption:
-                        try:
-                            download_subtitles(
-                                self.current_video_index,
-                                subtitles.get("lines", []),
-                                video_name,
-                                chapter_path,
-                                duration_in_ms,
-                            )
-                        except Exception as e:
-                            click.echo(click.style(
-                                f"[WARNING] Failed to download subtitles: {str(e)}",
-                                fg="yellow"
-                            ))
-
-                    # Mark video as successfully downloaded
+                # Mark video as successfully downloaded
+                if download_success:
                     self._mark_video_downloaded(video_name)
                     self.summary['videos']['downloaded'] += 1
-
-                except Exception as e:
-                    self._start_modified_spinner("...")
+                    downloaded_in_chapter += 1
+                else:
                     self.summary['videos']['failed'] += 1
-                    self.summary['errors'].append(f"Failed to download '{video_name}': {str(e)}")
-                    if self.debug_mode:
-                        self._save_debug_info(
-                            {"error": str(e), "download_url": download_url},
-                            video_slug,
-                            "download_failed"
-                        )
-
-
+
             except Exception as e:
                 self._start_modified_spinner("...")
                 self.summary['videos']['failed'] += 1
@@ -614,30 +604,26 @@ def fetch_chapter(self, chapter, chapters_pad_length, delay):
                         "processing_error"
                     )
             video_index += 1
-
-        # Check if chapter is empty after processing
+
+        # After all downloads, check whether the chapter is truly empty (no .mp4 files)
         try:
-            entries = os.listdir(chapter_path)
-            if not entries:
+            mp4_files = [f for f in os.listdir(chapter_path) if f.endswith(".mp4")]
+            if len(videos) > 0 and not mp4_files:
                 self.summary['chapters']['empty'] += 1
         except OSError as e:
             self.summary['errors'].append(f"Error checking chapter directory {chapter_path}: {str(e)}")
-
+
     def download_entire_course(self, *args, **kwargs):
         skip_done_alert = kwargs.get("skip_done_alert", False)
         try:
-            # Initialize spinner for course initialization
             self._start_spinner("Initializing course download...")
-
             course_url = config.course_url.format(self.course_slug)
-
-            r = requests.get(
+            r = self._request_with_proxies(
+                "get",
                 course_url,
                 cookies=self.cookies,
                 headers=self.headers,
                 allow_redirects=True,
             )
-
             try:
                 response_json = r.json()
@@ -713,6 +709,61 @@ def download_entire_course(self, *args, **kwargs):
         if not skip_done_alert:
             self._print_summary()
 
+    def _get_next_proxy(self):
+        """Get the next proxy from the list in a round-robin fashion"""
+        if not self.proxies:
+            return None
+
+        proxy = self.proxies[self.current_proxy_idx]
+        self.current_proxy_idx = (self.current_proxy_idx + 1) % len(self.proxies)
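+        # requests expects a scheme->URL mapping; the same proxy is reused
+        # for both http and https traffic.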
+        return {
+            'http': proxy,
+            'https': proxy,
+        }
+
+    def _make_request(self, method, url, **kwargs):
+        """Wrapper around requests to handle proxy rotation and retries"""
+        max_retries = len(self.proxies) if self.proxies else 1
+        last_exception = None
+
+        for attempt in range(max_retries):
+            proxy = self._get_next_proxy()
+            try:
+                if proxy:
+                    kwargs['proxies'] = proxy
+                    if self.debug_mode:
+                        click.echo(click.style(f"Using proxy: {proxy}", fg="cyan"))
+
+                if method.upper() == 'GET':
+                    response = requests.get(url, **kwargs)
+                elif method.upper() == 'POST':
+                    response = requests.post(url, **kwargs)
+                elif method.upper() == 'PUT':
+                    response = requests.put(url, **kwargs)
+                elif method.upper() == 'DELETE':
+                    response = requests.delete(url, **kwargs)
+                else:
+                    raise ValueError(f"Unsupported HTTP method: {method}")
+
+                response.raise_for_status()
+                return response
+
+            except requests.exceptions.RequestException as e:
+                last_exception = e
+                if self.debug_mode:
+                    click.echo(click.style(f"Attempt {attempt + 1} failed with proxy {proxy}: {str(e)}", fg="yellow"))
+                continue
+
+        # If we get here, all retries failed
+        if last_exception:
+            if self.debug_mode:
+                click.echo(click.style(f"All proxy attempts failed. Last error: {str(last_exception)}", fg="red"))
+            raise last_exception
+
+        raise Exception("Failed to make request")
+
     def _print_summary(self):
         """Print a formatted summary of the download process in a table format"""
         summary = self.summary
@@ -784,3 +833,23 @@ def _stop_spinner(self):
         """Ensure output ends with a newline"""
         sys.stdout.write("\n")
         sys.stdout.flush()
+
+    def _request_with_proxies(self, method, url, **kwargs):
+        """
+        Try all proxies in self.proxies for a request, return the first successful response.
+        If no proxies or all fail, raise the last exception.
+        """
+        proxies = self.proxies if self.proxies else [None]
+        last_exc = None
+        for proxy in proxies:
+            try:
+                with Session() as session:
+                    if proxy:
+                        session.proxies = {"http": proxy, "https": proxy}
+                    resp = session.request(method, url, **kwargs)
+                    resp.raise_for_status()
+                    return resp
+            except Exception as exc:
+                last_exc = exc
+                continue
+        raise last_exc if last_exc else Exception("All proxies failed or no proxies provided.")
diff --git a/llvd/cli.py b/llvd/cli.py
index 27dee88..d5ae9ea 100644
--- a/llvd/cli.py
+++ b/llvd/cli.py
@@ -1,10 +1,11 @@
 import click
 from typing import Optional
 
 from llvd import config, __version__
 from llvd.app import App
 from llvd.process_io import parse_cookie_file, parse_header_file
 from llvd.validators import validate_course_and_path, parse_throttle
+from llvd.utils import load_proxies
 
 BOLD = "\033[1m"
 RED_COLOR = "\u001b[31m"
@@ -60,6 +62,12 @@
     "-t",
     help="Min,max wait in seconds between downloads (e.g., '10,30' or '5')",
 )
+@click.option(
+    "--proxy-file",
+    "proxy_file",
+    default=None,
+    help="Path to a file containing a list of proxies (one per line)",
+)
 @click.pass_context
 def main(
     ctx: click.Context,
@@ -72,6 +80,7 @@ def main(
     course: Optional[str],
     path: Optional[str],
     throttle: Optional[str],
+    proxy_file: Optional[str],
 ) -> None:
     """
     LinkedIn Learning Video Downloader (LLVD)
@@ -89,11 +98,21 @@ def main(
         return
 
     try:
+        # Parse proxy file if provided
+        proxies = []
+        if proxy_file:
+            try:
+                proxies = load_proxies(proxy_file)
+                click.echo(click.style(f"Loaded {len(proxies)} proxies from {proxy_file}", fg="green"))
+            except Exception as e:
+                click.echo(click.style(f"Failed to load proxies from {proxy_file}: {str(e)}", fg="red"))
+                return
+
         # Validate and process course/path
         course_slug, is_path = validate_course_and_path(course, path)
 
         # Parse throttle values
-        throttle_values = parse_throttle(throttle)
+        throttle_values = parse_throttle(throttle) if throttle else None
 
         # Validate path requires throttle
         if is_path and not throttle_values:
@@ -108,6 +128,7 @@ def main(
             caption=caption,
             exercise=exercise,
             throttle=throttle_values,
+            proxies=proxies
         )
 
         if cookies:
@@ -132,4 +153,4 @@ def main(
 
     except Exception as e:
         click.echo(click.style(f"Error: {str(e)}", fg="red"), err=True)
-        ctx.exit(1)
\ No newline at end of file
+        ctx.exit(1)
diff --git a/llvd/downloader.py b/llvd/downloader.py
index fa357a2..0dd678e 100644
--- a/llvd/downloader.py
+++ b/llvd/downloader.py
@@ -1,49 +1,73 @@
 from tqdm import tqdm
 import requests
 import click
-import re
+import os
 from llvd.utils import clean_name, subtitles_time_format, throttle
 
 
-def download_video(url, index, filename, path, delay=None):
+def download_video(url, index, filename, path, delay=None, proxies=None):
     """
-    Downloads a video and saves it by its name plus index for easy sorting
+    Download a video file with a progress bar, skipping files that already exist.
+
+    Args:
+        url (str): The URL of the video to download
+        index (int): The index of the video in the course
+        filename (str): The name to save the file as
+        path (str): The directory to save the file in
+        delay (tuple, optional): Min,max wait in seconds applied before the download
+        proxies (list, optional): List of proxy URLs to use for the download
     """
     if delay:
         throttle(delay)
+
+    filename = clean_name(filename)
+    filepath = os.path.join(path, f"{index:02d}. {filename}.mp4")
+
+    # Skip if file already exists and has content
+    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
+        click.echo(click.style(f"✓ {filename} already exists", fg="green"))
+        return True
+
+    # Create a session for connection pooling
+    session = requests.Session()
+
+    # Configure proxy if available
+    if proxies and len(proxies) > 0:
+        proxy = proxies[0] if isinstance(proxies, list) else proxies
+        session.proxies = {'http': proxy, 'https': proxy}
 
-    maximum_retries = 5
-    with open(f"{path}/{index:0=2d}. {clean_name(filename)}.mp4", "wb") as f:
-        download_size = 0
-        while maximum_retries > 0:
-            adapter = requests.adapters.HTTPAdapter(max_retries=maximum_retries)
-            session = requests.Session()
-            session.mount(url, adapter)
-            response = session.get(
-                url,
-                stream=True,
-                headers={"Accept-Encoding": None, "Content-Encoding": "gzip"},
-            )
-            download_size = response.headers.get("content-length")
-            if download_size is None and maximum_retries > 0:
-                maximum_retries -= 1
-            else:
-                break
-        pbar = tqdm(
-            total=int(download_size),
-            initial=0,
-            unit="B",
-            unit_scale=True,
-            position=0,
-            leave=True,
-            miniters=1,
-            ascii=" ✹",
-        )
-        for chunk in response.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-                pbar.set_description("Downloading video... ")
-                pbar.update(len(chunk))
-        pbar.close()
+    try:
+        # Get the file size for the progress bar
+        response = session.get(url, stream=True, timeout=30)
+        response.raise_for_status()
+        total_size = int(response.headers.get('content-length', 0))
+
+        # Download the file in chunks and show progress
+        with open(filepath, 'wb') as f, tqdm(
+            desc=f"{index:02d}. {filename}",
+            total=total_size,
+            unit='iB',
+            unit_scale=True,
+            unit_divisor=1024,
+            bar_format='{l_bar}{bar:20}{r_bar}'
+        ) as bar:
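+            # Stream in 8 KiB chunks so large videos are never held fully
+            # in memory; tqdm is advanced by the bytes actually written.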
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    size = f.write(chunk)
+                    bar.update(size)
+
+        click.echo(click.style(f"✓ {filename} downloaded", fg="green"))
+        return True
+
+    except Exception as e:
+        # Clean up partially downloaded file
+        if os.path.exists(filepath):
+            os.remove(filepath)
+        click.echo(click.style(f"✗ Failed to download {filename}: {str(e)}", fg="red"))
+        return False
 
 
 def download_subtitles(index, subs, video_name, path, video_duration):
     """Write to a file (subtitle file) caption matching the right time."""
@@ -56,44 +79,57 @@ def download_subtitles(index, subs, video_name, path, video_duration):
             line = f"{i}\n{subtitles_time_format(starts_at)} --> {subtitles_time_format(ends_at)}\n{caption}\n\n"
             f.write(line.encode("utf8"))
 
-def download_exercises(links, path):
-    """Download exercises."""
+
+def download_exercises(links, path, proxies=None):
+    """
+    Download exercise files.
+
+    Args:
+        links (list): List of exercise file URLs
+        path (str): Directory path to save the exercise files
+        proxies (list, optional): List of proxy URLs to use for the download
+    """
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+    session = requests.Session()
+
+    # Configure proxy if available
+    if proxies and len(proxies) > 0:
+        proxy = proxies[0] if isinstance(proxies, list) else proxies
+        session.proxies = {'http': proxy, 'https': proxy}
+
     for link in links:
-        filename = re.split("exercises/(.+).zip", link)[1]
-        filepath = f"{path}/{clean_name(filename)}.zip"
-        with open(filepath, "wb") as f:
-            # Set up the request
-            request = requests.get(
-                link,
-                stream=True,
-                headers={"Accept-Encoding": None, "Content-Encoding": "gzip"},
-            )
-            download_size = request.headers.get("content-length")
-
-            # Show a progress bar while downloading the file
-            if download_size:
-                pbar = tqdm(
-                    total=int(download_size),
-                    initial=0,
-                    unit="B",
-                    unit_scale=True,
-                    position=0,
-                    leave=True,
-                    miniters=1,
-                    ascii=" ✹",
-                    desc="Downloading exercise files...",
-                )
-            else:
-                pbar = None
-
-            # Write the file in chunks to the disk
-            for chunk in request.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-                    if pbar:
-                        pbar.update(len(chunk))
-
-            # Close the progress bar
-            if pbar:
-                pbar.close()
-    print("\n")
\ No newline at end of file
+        try:
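+            # Derive the file name from the URL, dropping any query string
+            # (e.g. "exercise.zip?token=..." is saved as "exercise.zip")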
") - pbar.update(len(chunk)) - pbar.close() + unit_divisor=1024, + bar_format='{l_bar}{bar:20}{r_bar}' + ) as bar: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + size = f.write(chunk) + bar.update(size) + + click.echo(click.style(f"✓ {filename} downloaded", fg="green")) + return True + + except Exception as e: + # Clean up partially downloaded file + if os.path.exists(filepath): + os.remove(filepath) + click.echo(click.style(f"✗ Failed to download {filename}: {str(e)}", fg="red")) + return False def download_subtitles(index, subs, video_name, path, video_duration): """Write to a file (subtitle file) caption matching the right time.""" @@ -56,44 +79,54 @@ def download_subtitles(index, subs, video_name, path, video_duration): line = f"{i}\n{subtitles_time_format(starts_at)} --> {subtitles_time_format(ends_at)}\n{caption}\n\n" f.write(line.encode("utf8")) -def download_exercises(links, path): - """Download exercises.""" +def download_exercises(links, path, proxies=None): + """ + Download exercise files. + + Args: + links (list): List of exercise file URLs + path (str): Directory path to save the exercise files + proxies (list, optional): List of proxy URLs to use for the download + """ + if not os.path.exists(path): + os.makedirs(path) + + session = requests.Session() + + # Configure proxy if available + if proxies and len(proxies) > 0: + proxy = proxies[0] if isinstance(proxies, list) else proxies + session.proxies = {'http': proxy, 'https': proxy} + for link in links: - filename = re.split("exercises/(.+).zip", link)[1] - filepath = f"{path}/{clean_name(filename)}.zip" - with open(filepath, "wb") as f: - # Set up the request - request = requests.get( - link, - stream=True, - headers={"Accept-Encoding": None, "Content-Encoding": "gzip"}, - ) - download_size = request.headers.get("content-length") - - # Show a progress bar while downloading the file - if download_size: - pbar = tqdm( - total=int(download_size), - initial=0, - unit="B", + try: + filename = os.path.basename(link.split('?')[0]) + filepath = os.path.join(path, filename) + + # Skip if file already exists and has content + if os.path.exists(filepath) and os.path.getsize(filepath) > 0: + click.echo(click.style(f"✓ Exercise {filename} already exists", fg="green")) + continue + + with session.get(link, stream=True, timeout=30) as r: + r.raise_for_status() + total_size = int(r.headers.get('content-length', 0)) + + with open(filepath, 'wb') as f, tqdm( + desc=f"Downloading {filename}", + total=total_size, + unit='iB', unit_scale=True, - position=0, - leave=True, - miniters=1, - ascii=" ✹", - desc="Downloading exercise files...", - ) - else: - pbar = None - - # Write the file in chunks to the disk - for chunk in request.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - if pbar: - pbar.update(len(chunk)) - - # Close the progress bar - if pbar: - pbar.close() - print("\n") \ No newline at end of file + unit_divisor=1024, + bar_format='{l_bar}{bar:20}{r_bar}' + ) as bar: + for chunk in r.iter_content(chunk_size=8192): + if chunk: + size = f.write(chunk) + bar.update(size) + + click.echo(click.style(f"✓ Downloaded exercise: {filename}", fg="green")) + + except Exception as e: + click.echo(click.style(f"✗ Failed to download exercise {link}: {str(e)}", fg="red")) + continue \ No newline at end of file diff --git a/llvd/utils.py b/llvd/utils.py index 9f99cff..ad2af12 100644 --- a/llvd/utils.py +++ b/llvd/utils.py @@ -2,6 +2,7 @@ import re from random import randint from time import sleep +import random def 
+    """
+    with open(proxy_file_path, "r") as f:
+        stripped = (line.strip() for line in f)
+        return [line for line in stripped if line and not line.startswith("#")]
+
+
+def get_random_proxy(proxies):
+    """Return a random proxy from the list."""
+    return random.choice(proxies) if proxies else None
diff --git a/proxies.example b/proxies.example
new file mode 100644
index 0000000..a5e6edf
--- /dev/null
+++ b/proxies.example
@@ -0,0 +1,23 @@
+# Example proxy file for LLVD
+# Add one proxy per line in the format:
+#   protocol://username:password@host:port
+# or
+#   protocol://host:port
+
+# HTTP/HTTPS proxies (with authentication)
+# http://user:pass@proxy1.example.com:8080
+# https://user:pass@proxy2.example.com:3128
+
+# SOCKS proxies (with authentication)
+# socks5://user:pass@proxy3.example.com:1080
+# socks4://user:pass@proxy4.example.com:1080
+
+# Example without authentication
+# http://proxy.example.com:8080
+# https://proxy.example.com:3128
+
+# To use with LLVD, save this file as 'proxies.txt' and run:
+# llvd -c "course-slug" --proxy-file proxies.txt
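+#
+# When a request for course data fails, llvd retries it through the next
+# proxy in this list, trying each entry at most once before giving up.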