From daca187cb52866e0b5b538b6d0d52537e76e467b Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 04:05:05 +0000
Subject: [PATCH 1/2] Strip non-content HTML tags from output entirely

Remove skip-tag regions (script, style, head, noscript, svg, nav,
footer) from the HTML content before paragraph matching, instead of
only excluding paragraphs that start inside them. This ensures the
regions don't appear in the output at all, since they contain no book
content and don't show in the ebook anyway.

Co-Authored-By: tom mottes <tom.mottes@gmail.com>
---
 main.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)
diff --git a/main.py b/main.py
index 52ca3c1..c91677f 100644
--- a/main.py
+++ b/main.py
@@ -462,14 +462,6 @@ def flush_batch():
     re.DOTALL | re.IGNORECASE
 )
 
-def _get_skip_ranges(content: str) -> List[Tuple[int, int]]:
-    return [(m.start(), m.end()) for m in SKIP_TAG_PATTERN.finditer(content)]
-
-def _in_skip_range(pos: int, skip_ranges: List[Tuple[int, int]]) -> bool:
-    for start, end in skip_ranges:
-        if start <= pos < end:
-            return True
-    return False
 
 def _decode_html_text(text: str) -> str:
     decoded = html_module.unescape(text)
@@ -539,8 +531,8 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
     with open(input_path, 'r', encoding='utf-8') as f:
         content = f.read()
 
-    skip_ranges = _get_skip_ranges(content)
-    matches = [m for m in PARAGRAPH_PATTERN.finditer(content) if not _in_skip_range(m.start(), skip_ranges)]
+    content = SKIP_TAG_PATTERN.sub('', content)
+    matches = list(PARAGRAPH_PATTERN.finditer(content))
     paragraph_count = len(matches)
 
     checkpoint_path = get_checkpoint_path(output_path) if output_path else None

From c7b61554f4c28be672906d976dc8a38d1cdb3c19 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 04:08:19 +0000
Subject: [PATCH 2/2] Update test to expect head/title stripped from output

Co-Authored-By: tom mottes <tom.mottes@gmail.com>
---
 test_main.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test_main.py b/test_main.py
index 3f66805..869c9d4 100644
--- a/test_main.py
+++ b/test_main.py
@@ -421,7 +421,8 @@ def test_non_paragraph_content_preserved(self, tmp_path):
         process_html_file(str(input_file), str(output_file))
         result = output_file.read_text(encoding="utf-8")
         assert "<h1>Title</h1>" in result
-        assert "<title>Test</title>" in result
+        assert "<head>" not in result
+        assert "<title>" not in result
 
     def test_empty_paragraph_no_crash(self, tmp_path):
         html = "<html><body><p></p><p>Real content here.</p></body></html>"