From daca187cb52866e0b5b538b6d0d52537e76e467b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 16 Feb 2026 04:05:05 +0000 Subject: [PATCH 1/2] Strip non-content HTML tags from output entirely Remove skip-tag regions (script, style, head, noscript, svg, nav, footer) from the HTML content before paragraph matching. This ensures they don't appear in the output at all, since they contain no book content and don't show in the ebook anyway. Co-Authored-By: tom mottes --- main.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/main.py b/main.py index 52ca3c1..c91677f 100644 --- a/main.py +++ b/main.py @@ -462,14 +462,6 @@ def flush_batch(): re.DOTALL | re.IGNORECASE ) -def _get_skip_ranges(content: str) -> List[Tuple[int, int]]: - return [(m.start(), m.end()) for m in SKIP_TAG_PATTERN.finditer(content)] - -def _in_skip_range(pos: int, skip_ranges: List[Tuple[int, int]]) -> bool: - for start, end in skip_ranges: - if start <= pos < end: - return True - return False def _decode_html_text(text: str) -> str: decoded = html_module.unescape(text) @@ -539,8 +531,8 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool with open(input_path, 'r', encoding='utf-8') as f: content = f.read() - skip_ranges = _get_skip_ranges(content) - matches = [m for m in PARAGRAPH_PATTERN.finditer(content) if not _in_skip_range(m.start(), skip_ranges)] + content = SKIP_TAG_PATTERN.sub('', content) + matches = list(PARAGRAPH_PATTERN.finditer(content)) paragraph_count = len(matches) checkpoint_path = get_checkpoint_path(output_path) if output_path else None From c7b61554f4c28be672906d976dc8a38d1cdb3c19 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 16 Feb 2026 04:08:19 +0000 Subject: [PATCH 2/2] Update test to expect head/title stripped from output Co-Authored-By: tom mottes --- test_main.py | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/test_main.py b/test_main.py index 3f66805..869c9d4 100644 --- a/test_main.py +++ b/test_main.py @@ -421,7 +421,8 @@ def test_non_paragraph_content_preserved(self, tmp_path): process_html_file(str(input_file), str(output_file)) result = output_file.read_text(encoding="utf-8") assert "

Title

" in result - assert "Test" in result + assert "<head>" not in result + assert "<title>" not in result def test_empty_paragraph_no_crash(self, tmp_path): html = "<html><body><p></p><p>Real content here.</p></body></html>"