From c05076db357d00b969b16955926776aa12764464 Mon Sep 17 00:00:00 2001
From: Simon Kobler
Date: Fri, 23 Jan 2026 08:02:00 +0100
Subject: [PATCH 1/2] Enhance asset downloading and processing logic to handle
 directory creation and file naming more robustly

---
 webexp/cli.py | 131 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 101 insertions(+), 30 deletions(-)

diff --git a/webexp/cli.py b/webexp/cli.py
index 8bbe495..01299f3 100644
--- a/webexp/cli.py
+++ b/webexp/cli.py
@@ -275,7 +275,15 @@ def download_file(url, output_path, asset_type):
         try:
             response = requests.get(url, stream=True, timeout=10)
             response.raise_for_status()
-            os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+            # Ensure the directory exists; handle case where a file exists at the dir path
+            dir_path = os.path.dirname(output_path)
+            if dir_path:
+                if os.path.exists(dir_path) and not os.path.isdir(dir_path):
+                    logger.error("Cannot create directory %s: file exists at this path", dir_path)
+                    return
+                os.makedirs(dir_path, exist_ok=True)
+
             with open(output_path, 'wb') as file:
                 for chunk in response.iter_content(chunk_size=8192):
                     file.write(chunk)
@@ -291,20 +299,21 @@
         for url in urls:
             # Create the output path by preserving the folder structure
             parsed_uri = urlparse(url)
-            relative_path = url.replace(
-                parsed_uri.scheme + "://", ""
-            ).replace(parsed_uri.netloc, "")
-            if asset_type != 'html':
-                relative_path = re.sub(
-                    SCAN_CDN_REGEX,
-                    asset_type + "/",
-                    url
-                )
             if asset_type == 'html':
+                relative_path = url.replace(
+                    parsed_uri.scheme + "://", ""
+                ).replace(parsed_uri.netloc, "")
                 if relative_path == "":
                     relative_path = "index.html"
                 else:
                     relative_path = f"{relative_path}.html"
+            else:
+                # For non-HTML assets, preserve the filename from the URL
+                filename = os.path.basename(parsed_uri.path)
+                if not filename:
+                    # If no filename in path, use a hash of the URL
+                    filename = f"asset_{hash(url) & 0xFFFFFFFF:08x}"
+                relative_path = os.path.join(asset_type, filename)

             output_path = os.path.join(output_folder, relative_path.strip("/"))

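The hunk above stops rewriting non-HTML asset paths through SCAN_CDN_REGEX and instead derives them from the URL itself. A minimal sketch of that derivation, using a hypothetical CDN URL; note that Python's hash() is salted per process for strings (PYTHONHASHSEED), so the fallback name is not reproducible across runs — a hashlib digest would be, if stability matters:

    import os
    from urllib.parse import urlparse

    # Hypothetical CDN URL, for illustration only.
    url = "https://cdn.example.com/static/v2/app.min.js"

    filename = os.path.basename(urlparse(url).path)   # 'app.min.js'
    if not filename:
        # Same fallback as the patch; not stable across runs
        # because str hashes are salted per process.
        filename = f"asset_{hash(url) & 0xFFFFFFFF:08x}"

    print(os.path.join("js", filename))               # js/app.min.js
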
@@ -320,27 +329,38 @@ def process_html(file):
     # Process JS
     for tag in soup.find_all([ 'script']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            tag['src'] = re.sub(SCAN_CDN_REGEX, "/js/", tag['src'])
+            filename = os.path.basename(urlparse(tag['src']).path)
+            tag['src'] = f"/js/{filename}"
+            # Remove integrity attribute since the file content may have been modified
+            if tag.has_attr('integrity'):
+                del tag['integrity']

     # Process CSS
     for tag in soup.find_all([ 'link'], rel="stylesheet"):
         if tag.has_attr('href') and re.match(CDN_URL_REGEX, tag['href']) is not None:
-            tag['href'] = re.sub(SCAN_CDN_REGEX, "/css/", tag['href'])
+            filename = os.path.basename(urlparse(tag['href']).path)
+            tag['href'] = f"/css/{filename}"
+            # Remove integrity attribute since the file content may have been modified
+            if tag.has_attr('integrity'):
+                del tag['integrity']

     # Process links like favicons
     for tag in soup.find_all([ 'link'], rel=["apple-touch-icon", "shortcut icon"]):
         if tag.has_attr('href') and re.match(CDN_URL_REGEX, tag['href']) is not None:
-            tag['href'] = re.sub(SCAN_CDN_REGEX, "/images/", tag['href']).replace("//", "/")
+            filename = os.path.basename(urlparse(tag['href']).path)
+            tag['href'] = f"/images/{filename}"

     # Process IMG
     for tag in soup.find_all([ 'img']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            tag['src'] = re.sub(SCAN_CDN_REGEX, "/images/", tag['src']).replace("//", "/")
+            filename = os.path.basename(urlparse(tag['src']).path)
+            tag['src'] = f"/images/{filename}"

     # Process Media
     for tag in soup.find_all([ 'video', 'audio']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            tag['src'] = re.sub(SCAN_CDN_REGEX, "/media/", tag['src'])
+            filename = os.path.basename(urlparse(tag['src']).path)
+            tag['src'] = f"/media/{filename}"

     # Format and unminify the HTML
     formatted_html = soup.prettify()
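Dropping the integrity attribute is defensible here: a Subresource Integrity hash covers the exact bytes the browser fetches, and since the downloaded CSS is rewritten later in this patch, a stale hash would make the browser block the resource outright. A minimal BeautifulSoup sketch of the same operation, on hypothetical markup:

    from bs4 import BeautifulSoup

    # Hypothetical markup, for illustration only.
    html = ('<script src="https://cdn.example.com/lib.min.js" '
            'integrity="sha384-deadbeef" crossorigin="anonymous"></script>')

    soup = BeautifulSoup(html, "html.parser")
    tag = soup.find("script")
    tag["src"] = "/js/lib.min.js"   # point at the self-hosted copy
    if tag.has_attr("integrity"):   # hash may no longer match what is served
        del tag["integrity"]

    print(soup.prettify())

The crossorigin attribute becomes redundant once the resource is same-origin, though leaving it in place is harmless.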
@@ -362,27 +382,78 @@ def process_css(file_path, output_folder):
         content = f.read()
         logger.info("Processing CSS file: %s", file_path)

-        # Find all image URLs in the CSS content
-        image_urls = re.findall(SCAN_CDN_REGEX, content)
-        print("Found %d image URLs in CSS file", image_urls)
-        for match in image_urls:
-            full_url = match[0]
+        # Find all asset URLs in the CSS content (images, fonts, etc.)
+        asset_urls = re.findall(SCAN_CDN_REGEX, content)
+        logger.info("Found %d asset URLs in CSS file", len(asset_urls))
+        for full_url in asset_urls:
             if full_url:
-                # Download the image to the output path/images
-                image_output_path = os.path.join(os.path.dirname(output_folder), "images", os.path.basename(full_url))
+                # Clean URL: strip whitespace and trailing %20 (URL-encoded space)
+                full_url = full_url.rstrip()
+                while full_url.endswith('%20'):
+                    full_url = full_url[:-3]
+
+                # Skip URLs without file extensions (likely incomplete/malformed)
+                parsed_path = urlparse(full_url).path
+                filename = os.path.basename(parsed_path)
+                if '.' not in filename:
+                    logger.warning("Skipping URL without file extension: %s", full_url)
+                    continue
+
+                # Determine asset type by file extension and download to appropriate folder
+                ext = filename.lower().split('.')[-1]
+                if ext in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
+                    asset_folder = "fonts"
+                elif ext in ['js']:
+                    asset_folder = "js"
+                elif ext in ['css']:
+                    asset_folder = "css"
+                elif ext in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
+                    asset_folder = "media"
+                else:
+                    asset_folder = "images"
+
+                asset_output_path = os.path.join(output_folder, asset_folder, filename)
                 try:
                     response = requests.get(full_url, stream=True, timeout=10)
                     response.raise_for_status()
-                    os.makedirs(os.path.dirname(image_output_path), exist_ok=True)
-                    with open(image_output_path, 'wb') as img_file:
+                    os.makedirs(os.path.dirname(asset_output_path), exist_ok=True)
+                    with open(asset_output_path, 'wb') as asset_file:
                         for chunk in response.iter_content(chunk_size=8192):
-                            img_file.write(chunk)
-                    logger.info("Downloaded image: %s", full_url)
+                            asset_file.write(chunk)
+                    logger.info("Downloaded %s asset: %s", asset_folder, full_url)
                 except requests.RequestException as e:
-                    logger.error("Failed to download image %s: %s", full_url, e)
-
-        # Replace CDN URLs with local paths for images
-        updated_content = re.sub(SCAN_CDN_REGEX, "/images/", content).replace("//", "/")
+                    logger.error("Failed to download asset %s: %s", full_url, e)
+
+        # Replace CDN URLs with local paths, preserving filenames and categorizing by type
+        def replace_cdn_url(match):
+            url = match.group(0).rstrip()
+            while url.endswith('%20'):
+                url = url[:-3]
+
+            parsed_url = urlparse(url)
+            filename = os.path.basename(parsed_url.path)
+
+            # Only replace URLs with file extensions (others are malformed/incomplete)
+            if '.' not in filename:
+                return url
+
+            # Determine asset type by file extension
+            ext = filename.lower().split('.')[-1]
+            if ext in ['js']:
+                return f"/js/{filename}"
+            elif ext in ['css']:
+                return f"/css/{filename}"
+            elif ext in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
+                return f"/fonts/{filename}"
+            elif ext in ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'ico']:
+                return f"/images/{filename}"
+            elif ext in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
+                return f"/media/{filename}"
+            else:
+                # Default to images for unknown types
+                return f"/images/{filename}"
+
+        updated_content = re.sub(SCAN_CDN_REGEX, replace_cdn_url, content)
         f.seek(0)
         f.write(updated_content)
         f.truncate()
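The second patch replaces this preserved-filename scheme with UUID-based names. The motivation: two CDN URLs can share a basename (for example .../v1/logo.png and .../v2/logo.png), and under the first patch they would overwrite each other on disk. The cost is a url_to_filename map that every later rewrite must consult. A condensed sketch of the bookkeeping, with hypothetical URLs; note that the secondary basename key can itself still collide (the last registration wins), while the full-URL key stays unambiguous:

    import os
    import uuid
    from urllib.parse import urlparse

    url_to_filename = {}

    def register(url, asset_type):
        """Assign a UUID-based local path to a CDN URL and record it."""
        original = os.path.basename(urlparse(url).path)
        ext = "." + original.split(".")[-1] if "." in original else ""
        local = f"/{asset_type}/{uuid.uuid4()}{ext}"
        url_to_filename[url] = local       # unambiguous key
        url_to_filename[original] = local  # fallback key for CSS url() refs
        return local

    # Hypothetical URLs that would have collided under the first patch:
    register("https://cdn.example.com/v1/logo.png", "images")
    register("https://cdn.example.com/v2/logo.png", "images")
    print(len(url_to_filename))  # 3: two URL keys, one shared basename key
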
From 4a62cf9555f9f6803ef283248ca33f509bd2419b Mon Sep 17 00:00:00 2001
From: Simon Kobler
Date: Fri, 23 Jan 2026 09:33:02 +0100
Subject: [PATCH 2/2] Enhance asset downloading to use UUID-based filenames
 and improve internal reference handling in HTML and CSS processing

---
 webexp/cli.py | 206 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 141 insertions(+), 65 deletions(-)

diff --git a/webexp/cli.py b/webexp/cli.py
index 01299f3..5c35660 100644
--- a/webexp/cli.py
+++ b/webexp/cli.py
@@ -11,6 +11,7 @@
 import os
 import sys
 import logging
+import uuid
 from datetime import datetime
 from importlib.metadata import version
 import requests
@@ -271,6 +272,9 @@ def recursive_scan(current_url):

 def download_assets(assets, output_folder):
     """Download assets from the CDN and save them to the output folder."""
+    # Create a mapping of original URLs to UUID-based filenames
+    url_to_filename = {}
+
     def download_file(url, output_path, asset_type):
         try:
             response = requests.get(url, stream=True, timeout=10)
@@ -281,46 +285,84 @@ def download_file(url, output_path, asset_type):
             if dir_path:
                 if os.path.exists(dir_path) and not os.path.isdir(dir_path):
                     logger.error("Cannot create directory %s: file exists at this path", dir_path)
-                    return
+                    return None
                 os.makedirs(dir_path, exist_ok=True)

             with open(output_path, 'wb') as file:
                 for chunk in response.iter_content(chunk_size=8192):
                     file.write(chunk)
-            if asset_type == 'html':
-                process_html(output_path)
-            elif asset_type == 'css':
-                process_css(output_path, output_folder)
+            return output_path
         except requests.RequestException as e:
             logger.error("Failed to download asset %s: %s", url, e)
+            return None

+    # First pass: download all non-HTML assets with UUID names and build mapping
     for asset_type, urls in assets.items():
+        if asset_type == 'html':
+            continue
         logger.debug("Downloading %s assets...", asset_type)
         for url in urls:
-            # Create the output path by preserving the folder structure
             parsed_uri = urlparse(url)
-            if asset_type == 'html':
-                relative_path = url.replace(
-                    parsed_uri.scheme + "://", ""
-                ).replace(parsed_uri.netloc, "")
-                if relative_path == "":
-                    relative_path = "index.html"
-                else:
-                    relative_path = f"{relative_path}.html"
+            original_filename = os.path.basename(parsed_uri.path)
+            if not original_filename:
+                original_filename = f"asset_{hash(url) & 0xFFFFFFFF:08x}"
+
+            # Get file extension
+            if '.' in original_filename:
+                ext = '.' + original_filename.split('.')[-1]
             else:
-                # For non-HTML assets, preserve the filename from the URL
-                filename = os.path.basename(parsed_uri.path)
-                if not filename:
-                    # If no filename in path, use a hash of the URL
-                    filename = f"asset_{hash(url) & 0xFFFFFFFF:08x}"
-                relative_path = os.path.join(asset_type, filename)
+                ext = ''
+
+            # Generate UUID-based filename
+            uuid_filename = str(uuid.uuid4()) + ext
+            relative_path = os.path.join(asset_type, uuid_filename)
+            output_path = os.path.join(output_folder, relative_path.strip("/"))
+
+            logger.info("Downloading %s to %s", url, output_path)
+            result = download_file(url, output_path, asset_type)

-            output_path = os.path.join(output_folder, relative_path.strip("/"))
+            if result:
+                # Store mapping: original_filename -> new path relative to output folder
+                url_to_filename[url] = f"/{asset_type}/{uuid_filename}"
+                # Also store by original filename for CSS @import and url() references
+                url_to_filename[original_filename] = f"/{asset_type}/{uuid_filename}"
+
+    # Process CSS files to update internal references and download additional assets
+    for asset_type, urls in assets.items():
+        if asset_type == 'css':
+            for url in urls:
+                css_path = None
+                # Find the downloaded CSS file path from our mapping
+                for orig_url, mapped_path in url_to_filename.items():
+                    if orig_url == url:
+                        css_path = os.path.join(output_folder, mapped_path.lstrip('/'))
+                        break
+                if css_path and os.path.exists(css_path):
+                    process_css(css_path, output_folder, url_to_filename)
+
+    # Second pass: download and process HTML files
+    for asset_type, urls in assets.items():
+        if asset_type != 'html':
+            continue
+        logger.debug("Downloading %s assets...", asset_type)
+        for url in urls:
+            parsed_uri = urlparse(url)
+            relative_path = url.replace(
+                parsed_uri.scheme + "://", ""
+            ).replace(parsed_uri.netloc, "")
+            if relative_path == "":
+                relative_path = "index.html"
+            else:
+                relative_path = f"{relative_path}.html"
+            output_path = os.path.join(output_folder, relative_path.strip("/"))

             logger.info("Downloading %s to %s", url, output_path)
-            download_file(url, output_path, asset_type)
+            result = download_file(url, output_path, asset_type)
+
+            if result:
+                process_html(output_path, url_to_filename)


-def process_html(file):
+def process_html(file, url_to_filename):
     """Process the HTML file to fix asset links and format the HTML."""
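One small simplification is possible in the CSS pass above: the items()/break loop that recovers css_path re-implements a plain dictionary lookup. A sketch of the direct O(1) form — find_downloaded_css is a hypothetical helper, not part of the patch, but it uses only names the patch defines:

    import os

    def find_downloaded_css(url, url_to_filename, output_folder):
        """Direct-lookup equivalent of the items()/break scan above."""
        mapped_path = url_to_filename.get(url)
        if mapped_path is None:
            return None
        css_path = os.path.join(output_folder, mapped_path.lstrip("/"))
        return css_path if os.path.exists(css_path) else None

    # Usage (names as in the patch):
    #     css_path = find_downloaded_css(url, url_to_filename, output_folder)
    #     if css_path:
    #         process_css(css_path, output_folder, url_to_filename)
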
@@ -329,8 +371,15 @@
     # Process JS
     for tag in soup.find_all([ 'script']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            filename = os.path.basename(urlparse(tag['src']).path)
-            tag['src'] = f"/js/{filename}"
+            original_url = tag['src']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['src'] = url_to_filename[original_url]
+            else:
+                # Fallback: try to find by original filename
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['src'] = url_to_filename[filename]
             # Remove integrity attribute since the file content may have been modified
             if tag.has_attr('integrity'):
                 del tag['integrity']
@@ -338,8 +387,15 @@
     # Process CSS
     for tag in soup.find_all([ 'link'], rel="stylesheet"):
         if tag.has_attr('href') and re.match(CDN_URL_REGEX, tag['href']) is not None:
-            filename = os.path.basename(urlparse(tag['href']).path)
-            tag['href'] = f"/css/{filename}"
+            original_url = tag['href']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['href'] = url_to_filename[original_url]
+            else:
+                # Fallback: try to find by original filename
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['href'] = url_to_filename[filename]
             # Remove integrity attribute since the file content may have been modified
             if tag.has_attr('integrity'):
                 del tag['integrity']
@@ -347,20 +403,38 @@
     # Process links like favicons
     for tag in soup.find_all([ 'link'], rel=["apple-touch-icon", "shortcut icon"]):
         if tag.has_attr('href') and re.match(CDN_URL_REGEX, tag['href']) is not None:
-            filename = os.path.basename(urlparse(tag['href']).path)
-            tag['href'] = f"/images/{filename}"
+            original_url = tag['href']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['href'] = url_to_filename[original_url]
+            else:
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['href'] = url_to_filename[filename]

     # Process IMG
     for tag in soup.find_all([ 'img']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            filename = os.path.basename(urlparse(tag['src']).path)
-            tag['src'] = f"/images/{filename}"
+            original_url = tag['src']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['src'] = url_to_filename[original_url]
+            else:
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['src'] = url_to_filename[filename]

     # Process Media
     for tag in soup.find_all([ 'video', 'audio']):
         if tag.has_attr('src') and re.match(CDN_URL_REGEX, tag['src']) is not None:
-            filename = os.path.basename(urlparse(tag['src']).path)
-            tag['src'] = f"/media/{filename}"
+            original_url = tag['src']
+            # Look up the mapped UUID filename
+            if original_url in url_to_filename:
+                tag['src'] = url_to_filename[original_url]
+            else:
+                filename = os.path.basename(urlparse(original_url).path)
+                if filename in url_to_filename:
+                    tag['src'] = url_to_filename[filename]

     # Format and unminify the HTML
     formatted_html = soup.prettify()
@@ -371,8 +445,8 @@
     logger.debug("Processed %s", file)


-def process_css(file_path, output_folder):
-    """Process the CSS file to fix asset links."""
+def process_css(file_path, output_folder, url_to_filename):
+    """Process the CSS file to fix asset links and download referenced assets."""

     if not os.path.exists(file_path):
         logger.error("CSS folder does not exist: %s", file_path)
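The five tag loops above now share an identical lookup-with-fallback pattern. A hypothetical helper — not part of the patch — that would collapse the repetition:

    import os
    from urllib.parse import urlparse

    def resolve_local(url, url_to_filename):
        """Return the mapped local path for a CDN URL: exact URL first,
        then the basename fallback; None if neither is mapped."""
        if url in url_to_filename:
            return url_to_filename[url]
        filename = os.path.basename(urlparse(url).path)
        return url_to_filename.get(filename)

    # Each loop body would then reduce to, e.g. for <script> tags:
    #     local = resolve_local(tag['src'], url_to_filename)
    #     if local:
    #         tag['src'] = local
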
@@ -392,27 +466,35 @@
                 while full_url.endswith('%20'):
                     full_url = full_url[:-3]

+                # Skip if already downloaded
+                if full_url in url_to_filename:
+                    continue
+
                 # Skip URLs without file extensions (likely incomplete/malformed)
                 parsed_path = urlparse(full_url).path
-                filename = os.path.basename(parsed_path)
-                if '.' not in filename:
+                original_filename = os.path.basename(parsed_path)
+                if '.' not in original_filename:
                     logger.warning("Skipping URL without file extension: %s", full_url)
                     continue

-                # Determine asset type by file extension and download to appropriate folder
-                ext = filename.lower().split('.')[-1]
-                if ext in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
+                # Determine asset type by file extension
+                ext = '.' + original_filename.lower().split('.')[-1]
+                ext_name = ext[1:]  # Without the dot
+                if ext_name in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
                     asset_folder = "fonts"
-                elif ext in ['js']:
+                elif ext_name in ['js']:
                     asset_folder = "js"
-                elif ext in ['css']:
+                elif ext_name in ['css']:
                     asset_folder = "css"
-                elif ext in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
+                elif ext_name in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
                     asset_folder = "media"
                 else:
                     asset_folder = "images"

-                asset_output_path = os.path.join(output_folder, asset_folder, filename)
+                # Generate UUID-based filename
+                uuid_filename = str(uuid.uuid4()) + ext
+                asset_output_path = os.path.join(output_folder, asset_folder, uuid_filename)
+
                 try:
                     response = requests.get(full_url, stream=True, timeout=10)
                     response.raise_for_status()
@@ -420,38 +502,32 @@
                     with open(asset_output_path, 'wb') as asset_file:
                         for chunk in response.iter_content(chunk_size=8192):
                             asset_file.write(chunk)
-                    logger.info("Downloaded %s asset: %s", asset_folder, full_url)
+                    logger.info("Downloaded %s asset: %s -> %s", asset_folder, full_url, uuid_filename)
+                    # Add to mapping
+                    url_to_filename[full_url] = f"/{asset_folder}/{uuid_filename}"
+                    url_to_filename[original_filename] = f"/{asset_folder}/{uuid_filename}"
                 except requests.RequestException as e:
                     logger.error("Failed to download asset %s: %s", full_url, e)

-        # Replace CDN URLs with local paths, preserving filenames and categorizing by type
+        # Replace CDN URLs with UUID-based local paths
         def replace_cdn_url(match):
             url = match.group(0).rstrip()
             while url.endswith('%20'):
                 url = url[:-3]

+            # Look up in mapping
+            if url in url_to_filename:
+                return url_to_filename[url]
+
+            # Try by filename
             parsed_url = urlparse(url)
             filename = os.path.basename(parsed_url.path)
+            if filename in url_to_filename:
+                return url_to_filename[filename]

-            # Only replace URLs with file extensions (others are malformed/incomplete)
-            if '.' not in filename:
-                return url
-
-            # Determine asset type by file extension
-            ext = filename.lower().split('.')[-1]
-            if ext in ['js']:
-                return f"/js/{filename}"
-            elif ext in ['css']:
-                return f"/css/{filename}"
-            elif ext in ['woff', 'woff2', 'ttf', 'eot', 'otf']:
-                return f"/fonts/{filename}"
-            elif ext in ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'ico']:
-                return f"/images/{filename}"
-            elif ext in ['mp4', 'webm', 'ogg', 'mp3', 'wav']:
-                return f"/media/{filename}"
-            else:
-                # Default to images for unknown types
-                return f"/images/{filename}"
+            # If not found, return original (shouldn't happen)
+            logger.warning("No mapping found for URL in CSS: %s", url)
+            return url

         updated_content = re.sub(SCAN_CDN_REGEX, replace_cdn_url, content)
         f.seek(0)
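The rewrite at the end hinges on passing replace_cdn_url as the repl argument to re.sub: when repl is a callable, re.sub invokes it once per match and substitutes its return value. A standalone demonstration, with a hypothetical pattern and stylesheet standing in for SCAN_CDN_REGEX and the downloaded CSS:

    import re

    # Hypothetical stand-ins for SCAN_CDN_REGEX and a downloaded stylesheet.
    CDN_PATTERN = r"https://cdn\.example\.com/\S+?\.(?:png|woff2|css)"
    css = "body { background: url(https://cdn.example.com/img/bg.png); }"

    mapping = {
        "https://cdn.example.com/img/bg.png":
            "/images/9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d.png",
    }

    def replace_cdn_url(match):
        url = match.group(0)
        return mapping.get(url, url)  # leave unmapped URLs untouched

    print(re.sub(CDN_PATTERN, replace_cdn_url, css))
    # body { background: url(/images/9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d.png); }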