From c05076db357d00b969b16955926776aa12764464 Mon Sep 17 00:00:00 2001
From: Simon Kobler
Date: Fri, 23 Jan 2026 08:02:00 +0100
Subject: [PATCH 1/2] Enhance asset downloading and processing logic to handle
 directory creation and file naming more robustly

---
 webexp/cli.py | 131 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 101 insertions(+), 30 deletions(-)

diff --git a/webexp/cli.py b/webexp/cli.py
index 8bbe495..01299f3 100644
--- a/webexp/cli.py
+++ b/webexp/cli.py
@@ -275,7 +275,15 @@ def download_file(url, output_path, asset_type):
         try:
             response = requests.get(url, stream=True, timeout=10)
             response.raise_for_status()
-            os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+            # Ensure the directory exists; handle case where a file exists at the dir path
+            dir_path = os.path.dirname(output_path)
+            if dir_path:
+                if os.path.exists(dir_path) and not os.path.isdir(dir_path):
+                    logger.error("Cannot create directory %s: file exists at this path", dir_path)
+                    return
+                os.makedirs(dir_path, exist_ok=True)
+
             with open(output_path, 'wb') as file:
                 for chunk in response.iter_content(chunk_size=8192):
                     file.write(chunk)
@@ -291,20 +299,21 @@
         for url in urls:
             # Create the output path by preserving the folder structure
             parsed_uri = urlparse(url)
-            relative_path = url.replace(
-                parsed_uri.scheme + "://", ""
-            ).replace(parsed_uri.netloc, "")
-            if asset_type != 'html':
-                relative_path = re.sub(
-                    SCAN_CDN_REGEX,
-                    asset_type + "/",
-                    url
-                )
             if asset_type == 'html':
+                relative_path = url.replace(
+                    parsed_uri.scheme + "://", ""
+                ).replace(parsed_uri.netloc, "")
                 if relative_path == "":
                     relative_path = "index.html"
                 else:
                     relative_path = f"{relative_path}.html"
+            else:
+                # For non-HTML assets, preserve the filename from the URL
+                filename = os.path.basename(parsed_uri.path)
+                if not filename:
+                    # If no filename in path, use a hash of the URL
+                    filename = f"asset_{hash(url) & 0xFFFFFFFF:08x}"
+                relative_path = os.path.join(asset_type, filename)

             output_path = os.path.join(output_folder, relative_path.strip("/"))

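The hunk above stops rewriting non-HTML asset paths through SCAN_CDN_REGEX and instead derives them from the URL itself. A minimal sketch of that derivation, using a hypothetical CDN URL; note that Python's hash() is salted per process for strings (PYTHONHASHSEED), so the fallback name is not reproducible across runs — a hashlib digest would be, if stability matters:

    import os
    from urllib.parse import urlparse

    # Hypothetical CDN URL, for illustration only.
    url = "https://cdn.example.com/static/v2/app.min.js"

    filename = os.path.basename(urlparse(url).path)   # 'app.min.js'
    if not filename:
        # Same fallback as the patch; not stable across runs
        # because str hashes are salted per process.
        filename = f"asset_{hash(url) & 0xFFFFFFFF:08x}"

    print(os.path.join("js", filename))               # js/app.min.js
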
@@ -320,27 +329,38 @@ def process_html(file):
     # Process JS
     for tag in soup.find_all([ 'script']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            tag['src'] = re.sub(SCAN_CDN_REGEX, "/js/", tag['src'])
+            filename = os.path.basename(urlparse(tag['src']).path)
+            tag['src'] = f"/js/{filename}"
+            # Remove integrity attribute since the file content may have been modified
+            if tag.has_attr('integrity'):
+                del tag['integrity']

     # Process CSS
     for tag in soup.find_all([ 'link'], rel="stylesheet"):
         if tag.has_attr('href') and re.match(CDN_URL_REGEX, tag['href']) is not None:
-            tag['href'] = re.sub(SCAN_CDN_REGEX, "/css/", tag['href'])
+            filename = os.path.basename(urlparse(tag['href']).path)
+            tag['href'] = f"/css/{filename}"
+            # Remove integrity attribute since the file content may have been modified
+            if tag.has_attr('integrity'):
+                del tag['integrity']

     # Process links like favicons
     for tag in soup.find_all([ 'link'], rel=["apple-touch-icon", "shortcut icon"]):
         if tag.has_attr('href') and re.match(CDN_URL_REGEX, tag['href']) is not None:
-            tag['href'] = re.sub(SCAN_CDN_REGEX, "/images/", tag['href']).replace("//", "/")
+            filename = os.path.basename(urlparse(tag['href']).path)
+            tag['href'] = f"/images/{filename}"

     # Process IMG
     for tag in soup.find_all([ 'img']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            tag['src'] = re.sub(SCAN_CDN_REGEX, "/images/", tag['src']).replace("//", "/")
+            filename = os.path.basename(urlparse(tag['src']).path)
+            tag['src'] = f"/images/{filename}"

     # Process Media
     for tag in soup.find_all([ 'video', 'audio']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            tag['src'] = re.sub(SCAN_CDN_REGEX, "/media/", tag['src'])
+            filename = os.path.basename(urlparse(tag['src']).path)
+            tag['src'] = f"/media/{filename}"

     # Format and unminify the HTML
     formatted_html = soup.prettify()
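Dropping the integrity attribute is defensible here: a Subresource Integrity hash covers the exact bytes the browser fetches, and since the downloaded CSS is rewritten later in this patch, a stale hash would make the browser block the resource outright. A minimal BeautifulSoup sketch of the same operation, on hypothetical markup:

    from bs4 import BeautifulSoup

    # Hypothetical markup, for illustration only.
    html = ('<script src="https://cdn.example.com/lib.min.js" '
            'integrity="sha384-deadbeef" crossorigin="anonymous"></script>')

    soup = BeautifulSoup(html, "html.parser")
    tag = soup.find("script")
    tag["src"] = "/js/lib.min.js"   # point at the self-hosted copy
    if tag.has_attr("integrity"):   # hash may no longer match what is served
        del tag["integrity"]

    print(soup.prettify())

The crossorigin attribute becomes redundant once the resource is same-origin, though leaving it in place is harmless.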
@@ -362,27 +382,78 @@ def process_css(file_path, output_folder):
         content = f.read()
         logger.info("Processing CSS file: %s", file_path)

-        # Find all image URLs in the CSS content
-        image_urls = re.findall(SCAN_CDN_REGEX, content)
-        print("Found %d image URLs in CSS file", image_urls)
-        for match in image_urls:
-            full_url = match[0]
+        # Find all asset URLs in the CSS content (images, fonts, etc.)
+        asset_urls = re.findall(SCAN_CDN_REGEX, content)
+        logger.info("Found %d asset URLs in CSS file", len(asset_urls))
+        for full_url in asset_urls:
             if full_url:
-                # Download the image to the output path/images
-                image_output_path = os.path.join(os.path.dirname(output_folder), "images", os.path.basename(full_url))
+                # Clean URL: strip whitespace and trailing %20 (URL-encoded space)
+                full_url = full_url.rstrip()
+                while full_url.endswith('%20'):
+                    full_url = full_url[:-3]
+
+                # Skip URLs without file extensions (likely incomplete/malformed)
+                parsed_path = urlparse(full_url).path
+                filename = os.path.basename(parsed_path)
+                if '.' not in filename:
+                    logger.warning("Skipping URL without file extension: %s", full_url)
+                    continue
+
+                # Determine asset type by file extension and download to appropriate folder
+                ext = filename.lower().split('.')[-1]
+                if ext in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
+                    asset_folder = "fonts"
+                elif ext in ['js']:
+                    asset_folder = "js"
+                elif ext in ['css']:
+                    asset_folder = "css"
+                elif ext in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
+                    asset_folder = "media"
+                else:
+                    asset_folder = "images"
+
+                asset_output_path = os.path.join(output_folder, asset_folder, filename)
                 try:
                     response = requests.get(full_url, stream=True, timeout=10)
                     response.raise_for_status()
-                    os.makedirs(os.path.dirname(image_output_path), exist_ok=True)
-                    with open(image_output_path, 'wb') as img_file:
+                    os.makedirs(os.path.dirname(asset_output_path), exist_ok=True)
+                    with open(asset_output_path, 'wb') as asset_file:
                         for chunk in response.iter_content(chunk_size=8192):
-                            img_file.write(chunk)
-                    logger.info("Downloaded image: %s", full_url)
+                            asset_file.write(chunk)
+                    logger.info("Downloaded %s asset: %s", asset_folder, full_url)
                 except requests.RequestException as e:
-                    logger.error("Failed to download image %s: %s", full_url, e)
-
-        # Replace CDN URLs with local paths for images
-        updated_content = re.sub(SCAN_CDN_REGEX, "/images/", content).replace("//", "/")
+                    logger.error("Failed to download asset %s: %s", full_url, e)
+
+        # Replace CDN URLs with local paths, preserving filenames and categorizing by type
+        def replace_cdn_url(match):
+            url = match.group(0).rstrip()
+            while url.endswith('%20'):
+                url = url[:-3]
+
+            parsed_url = urlparse(url)
+            filename = os.path.basename(parsed_url.path)
+
+            # Only replace URLs with file extensions (others are malformed/incomplete)
+            if '.' not in filename:
+                return url
+
+            # Determine asset type by file extension
+            ext = filename.lower().split('.')[-1]
+            if ext in ['js']:
+                return f"/js/{filename}"
+            elif ext in ['css']:
+                return f"/css/{filename}"
+            elif ext in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
+                return f"/fonts/{filename}"
+            elif ext in ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'ico']:
+                return f"/images/{filename}"
+            elif ext in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
+                return f"/media/{filename}"
+            else:
+                # Default to images for unknown types
+                return f"/images/{filename}"
+
+        updated_content = re.sub(SCAN_CDN_REGEX, replace_cdn_url, content)
         f.seek(0)
         f.write(updated_content)
         f.truncate()
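The second patch replaces this preserved-filename scheme with UUID-based names. The motivation: two CDN URLs can share a basename (for example .../v1/logo.png and .../v2/logo.png), and under the first patch they would overwrite each other on disk. The cost is a url_to_filename map that every later rewrite must consult. A condensed sketch of the bookkeeping, with hypothetical URLs; note that the secondary basename key can itself still collide (the last registration wins), while the full-URL key stays unambiguous:

    import os
    import uuid
    from urllib.parse import urlparse

    url_to_filename = {}

    def register(url, asset_type):
        """Assign a UUID-based local path to a CDN URL and record it."""
        original = os.path.basename(urlparse(url).path)
        ext = "." + original.split(".")[-1] if "." in original else ""
        local = f"/{asset_type}/{uuid.uuid4()}{ext}"
        url_to_filename[url] = local       # unambiguous key
        url_to_filename[original] = local  # fallback key for CSS url() refs
        return local

    # Hypothetical URLs that would have collided under the first patch:
    register("https://cdn.example.com/v1/logo.png", "images")
    register("https://cdn.example.com/v2/logo.png", "images")
    print(len(url_to_filename))  # 3: two URL keys, one shared basename key
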
From 4a62cf9555f9f6803ef283248ca33f509bd2419b Mon Sep 17 00:00:00 2001
From: Simon Kobler
Date: Fri, 23 Jan 2026 09:33:02 +0100
Subject: [PATCH 2/2] Enhance asset downloading to use UUID-based filenames
 and improve internal reference handling in HTML and CSS processing

---
 webexp/cli.py | 206 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 141 insertions(+), 65 deletions(-)

diff --git a/webexp/cli.py b/webexp/cli.py
index 01299f3..5c35660 100644
--- a/webexp/cli.py
+++ b/webexp/cli.py
@@ -11,6 +11,7 @@
 import os
 import sys
 import logging
+import uuid
 from datetime import datetime
 from importlib.metadata import version
 import requests
@@ -271,6 +272,9 @@ def recursive_scan(current_url):

 def download_assets(assets, output_folder):
     """Download assets from the CDN and save them to the output folder."""
+    # Create a mapping of original URLs to UUID-based filenames
+    url_to_filename = {}
+
     def download_file(url, output_path, asset_type):
         try:
             response = requests.get(url, stream=True, timeout=10)
@@ -281,46 +285,84 @@ def download_file(url, output_path, asset_type):
             if dir_path:
                 if os.path.exists(dir_path) and not os.path.isdir(dir_path):
                     logger.error("Cannot create directory %s: file exists at this path", dir_path)
-                    return
+                    return None
                 os.makedirs(dir_path, exist_ok=True)

             with open(output_path, 'wb') as file:
                 for chunk in response.iter_content(chunk_size=8192):
                     file.write(chunk)
-            if asset_type == 'html':
-                process_html(output_path)
-            elif asset_type == 'css':
-                process_css(output_path, output_folder)
+            return output_path
         except requests.RequestException as e:
             logger.error("Failed to download asset %s: %s", url, e)
+            return None

+    # First pass: download all non-HTML assets with UUID names and build mapping
     for asset_type, urls in assets.items():
+        if asset_type == 'html':
+            continue
         logger.debug("Downloading %s assets...", asset_type)
         for url in urls:
-            # Create the output path by preserving the folder structure
             parsed_uri = urlparse(url)
-            if asset_type == 'html':
-                relative_path = url.replace(
-                    parsed_uri.scheme + "://", ""
-                ).replace(parsed_uri.netloc, "")
-                if relative_path == "":
-                    relative_path = "index.html"
-                else:
-                    relative_path = f"{relative_path}.html"
+            original_filename = os.path.basename(parsed_uri.path)
+            if not original_filename:
+                original_filename = f"asset_{hash(url) & 0xFFFFFFFF:08x}"
+
+            # Get file extension
+            if '.' in original_filename:
+                ext = '.' + original_filename.split('.')[-1]
             else:
-                # For non-HTML assets, preserve the filename from the URL
-                filename = os.path.basename(parsed_uri.path)
-                if not filename:
-                    # If no filename in path, use a hash of the URL
-                    filename = f"asset_{hash(url) & 0xFFFFFFFF:08x}"
-                relative_path = os.path.join(asset_type, filename)
+                ext = ''
+
+            # Generate UUID-based filename
+            uuid_filename = str(uuid.uuid4()) + ext
+            relative_path = os.path.join(asset_type, uuid_filename)
+            output_path = os.path.join(output_folder, relative_path.strip("/"))
+
+            logger.info("Downloading %s to %s", url, output_path)
+            result = download_file(url, output_path, asset_type)

-            output_path = os.path.join(output_folder, relative_path.strip("/"))
+            if result:
+                # Store mapping: original_filename -> new path relative to output folder
+                url_to_filename[url] = f"/{asset_type}/{uuid_filename}"
+                # Also store by original filename for CSS @import and url() references
+                url_to_filename[original_filename] = f"/{asset_type}/{uuid_filename}"
+
+    # Process CSS files to update internal references and download additional assets
+    for asset_type, urls in assets.items():
+        if asset_type == 'css':
+            for url in urls:
+                css_path = None
+                # Find the downloaded CSS file path from our mapping
+                for orig_url, mapped_path in url_to_filename.items():
+                    if orig_url == url:
+                        css_path = os.path.join(output_folder, mapped_path.lstrip('/'))
+                        break
+                if css_path and os.path.exists(css_path):
+                    process_css(css_path, output_folder, url_to_filename)
+
+    # Second pass: download and process HTML files
+    for asset_type, urls in assets.items():
+        if asset_type != 'html':
+            continue
+        logger.debug("Downloading %s assets...", asset_type)
+        for url in urls:
+            parsed_uri = urlparse(url)
+            relative_path = url.replace(
+                parsed_uri.scheme + "://", ""
+            ).replace(parsed_uri.netloc, "")
+            if relative_path == "":
+                relative_path = "index.html"
+            else:
+                relative_path = f"{relative_path}.html"
+            output_path = os.path.join(output_folder, relative_path.strip("/"))

             logger.info("Downloading %s to %s", url, output_path)
-            download_file(url, output_path, asset_type)
+            result = download_file(url, output_path, asset_type)
+
+            if result:
+                process_html(output_path, url_to_filename)


-def process_html(file):
+def process_html(file, url_to_filename):
     """Process the HTML file to fix asset links and format the HTML."""
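One small simplification is possible in the CSS pass above: the items()/break loop that recovers css_path re-implements a plain dictionary lookup. A sketch of the direct O(1) form — find_downloaded_css is a hypothetical helper, not part of the patch, but it uses only names the patch defines:

    import os

    def find_downloaded_css(url, url_to_filename, output_folder):
        """Direct-lookup equivalent of the items()/break scan above."""
        mapped_path = url_to_filename.get(url)
        if mapped_path is None:
            return None
        css_path = os.path.join(output_folder, mapped_path.lstrip("/"))
        return css_path if os.path.exists(css_path) else None

    # Usage (names as in the patch):
    #     css_path = find_downloaded_css(url, url_to_filename, output_folder)
    #     if css_path:
    #         process_css(css_path, output_folder, url_to_filename)
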
@@ -329,8 +371,15 @@
     # Process JS
     for tag in soup.find_all([ 'script']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            filename = os.path.basename(urlparse(tag['src']).path)
-            tag['src'] = f"/js/{filename}"
+            original_url = tag['src']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['src'] = url_to_filename[original_url]
+            else:
+                # Fallback: try to find by original filename
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['src'] = url_to_filename[filename]
             # Remove integrity attribute since the file content may have been modified
             if tag.has_attr('integrity'):
                 del tag['integrity']
@@ -338,8 +387,15 @@
     # Process CSS
     for tag in soup.find_all([ 'link'], rel="stylesheet"):
         if tag.has_attr('href') and re.match(CDN_URL_REGEX, tag['href']) is not None:
-            filename = os.path.basename(urlparse(tag['href']).path)
-            tag['href'] = f"/css/{filename}"
+            original_url = tag['href']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['href'] = url_to_filename[original_url]
+            else:
+                # Fallback: try to find by original filename
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['href'] = url_to_filename[filename]
             # Remove integrity attribute since the file content may have been modified
             if tag.has_attr('integrity'):
                 del tag['integrity']
@@ -347,20 +403,38 @@
     # Process links like favicons
     for tag in soup.find_all([ 'link'], rel=["apple-touch-icon", "shortcut icon"]):
         if tag.has_attr('href') and re.match(CDN_URL_REGEX, tag['href']) is not None:
-            filename = os.path.basename(urlparse(tag['href']).path)
-            tag['href'] = f"/images/{filename}"
+            original_url = tag['href']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['href'] = url_to_filename[original_url]
+            else:
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['href'] = url_to_filename[filename]

     # Process IMG
     for tag in soup.find_all([ 'img']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            filename = os.path.basename(urlparse(tag['src']).path)
-            tag['src'] = f"/images/{filename}"
+            original_url = tag['src']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['src'] = url_to_filename[original_url]
+            else:
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['src'] = url_to_filename[filename]

     # Process Media
     for tag in soup.find_all([ 'video', 'audio']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            filename = os.path.basename(urlparse(tag['src']).path)
-            tag['src'] = f"/media/{filename}"
+            original_url = tag['src']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['src'] = url_to_filename[original_url]
+            else:
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['src'] = url_to_filename[filename]

     # Format and unminify the HTML
     formatted_html = soup.prettify()
@@ -371,8 +445,8 @@
     logger.debug("Processed %s", file)


-def process_css(file_path, output_folder):
-    """Process the CSS file to fix asset links."""
+def process_css(file_path, output_folder, url_to_filename):
+    """Process the CSS file to fix asset links and download referenced assets."""

     if not os.path.exists(file_path):
         logger.error("CSS folder does not exist: %s", file_path)
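The five tag loops above now share an identical lookup-with-fallback pattern. A hypothetical helper — not part of the patch — that would collapse the repetition:

    import os
    from urllib.parse import urlparse

    def resolve_local(url, url_to_filename):
        """Return the mapped local path for a CDN URL: exact URL first,
        then the basename fallback; None if neither is mapped."""
        if url in url_to_filename:
            return url_to_filename[url]
        filename = os.path.basename(urlparse(url).path)
        return url_to_filename.get(filename)

    # Each loop body would then reduce to, e.g. for <script> tags:
    #     local = resolve_local(tag['src'], url_to_filename)
    #     if local:
    #         tag['src'] = local
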
@@ -392,27 +466,35 @@
                 while full_url.endswith('%20'):
                     full_url = full_url[:-3]

+                # Skip if already downloaded
+                if full_url in url_to_filename:
+                    continue
+
                 # Skip URLs without file extensions (likely incomplete/malformed)
                 parsed_path = urlparse(full_url).path
-                filename = os.path.basename(parsed_path)
-                if '.' not in filename:
+                original_filename = os.path.basename(parsed_path)
+                if '.' not in original_filename:
                     logger.warning("Skipping URL without file extension: %s", full_url)
                     continue

-                # Determine asset type by file extension and download to appropriate folder
-                ext = filename.lower().split('.')[-1]
-                if ext in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
+                # Determine asset type by file extension
+                ext = '.' + original_filename.lower().split('.')[-1]
+                ext_name = ext[1:]  # Without the dot
+                if ext_name in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
                     asset_folder = "fonts"
-                elif ext in ['js']:
+                elif ext_name in ['js']:
                     asset_folder = "js"
-                elif ext in ['css']:
+                elif ext_name in ['css']:
                     asset_folder = "css"
-                elif ext in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
+                elif ext_name in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
                     asset_folder = "media"
                 else:
                     asset_folder = "images"

-                asset_output_path = os.path.join(output_folder, asset_folder, filename)
+                # Generate UUID-based filename
+                uuid_filename = str(uuid.uuid4()) + ext
+                asset_output_path = os.path.join(output_folder, asset_folder, uuid_filename)
+
                 try:
                     response = requests.get(full_url, stream=True, timeout=10)
                     response.raise_for_status()
@@ -420,38 +502,32 @@
                     with open(asset_output_path, 'wb') as asset_file:
                         for chunk in response.iter_content(chunk_size=8192):
                             asset_file.write(chunk)
-                    logger.info("Downloaded %s asset: %s", asset_folder, full_url)
+                    logger.info("Downloaded %s asset: %s -> %s", asset_folder, full_url, uuid_filename)
+                    # Add to mapping
+                    url_to_filename[full_url] = f"/{asset_folder}/{uuid_filename}"
+                    url_to_filename[original_filename] = f"/{asset_folder}/{uuid_filename}"
                 except requests.RequestException as e:
                     logger.error("Failed to download asset %s: %s", full_url, e)

-        # Replace CDN URLs with local paths, preserving filenames and categorizing by type
+        # Replace CDN URLs with UUID-based local paths
         def replace_cdn_url(match):
             url = match.group(0).rstrip()
             while url.endswith('%20'):
                 url = url[:-3]

+            # Look up in mapping
+            if url in url_to_filename:
+                return url_to_filename[url]
+
+            # Try by filename
             parsed_url = urlparse(url)
             filename = os.path.basename(parsed_url.path)
+            if filename in url_to_filename:
+                return url_to_filename[filename]

-            # Only replace URLs with file extensions (others are malformed/incomplete)
-            if '.' not in filename:
-                return url
-
-            # Determine asset type by file extension
-            ext = filename.lower().split('.')[-1]
-            if ext in ['js']:
-                return f"/js/{filename}"
-            elif ext in ['css']:
-                return f"/css/{filename}"
-            elif ext in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
-                return f"/fonts/{filename}"
-            elif ext in ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'ico']:
-                return f"/images/{filename}"
-            elif ext in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
-                return f"/media/{filename}"
-            else:
-                # Default to images for unknown types
-                return f"/images/{filename}"
+            # If not found, return original (shouldn't happen)
+            logger.warning("No mapping found for URL in CSS: %s", url)
+            return url

         updated_content = re.sub(SCAN_CDN_REGEX, replace_cdn_url, content)
         f.seek(0)
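The rewrite at the end hinges on passing replace_cdn_url as the repl argument to re.sub: when repl is a callable, re.sub invokes it once per match and substitutes its return value. A standalone demonstration, with a hypothetical pattern and stylesheet standing in for SCAN_CDN_REGEX and the downloaded CSS:

    import re

    # Hypothetical stand-ins for SCAN_CDN_REGEX and a downloaded stylesheet.
    CDN_PATTERN = r"https://cdn\.example\.com/\S+?\.(?:png|woff2|css)"
    css = "body { background: url(https://cdn.example.com/img/bg.png); }"

    mapping = {
        "https://cdn.example.com/img/bg.png":
            "/images/9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d.png",
    }

    def replace_cdn_url(match):
        url = match.group(0)
        return mapping.get(url, url)  # leave unmapped URLs untouched

    print(re.sub(CDN_PATTERN, replace_cdn_url, css))
    # body { background: url(/images/9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d.png); }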