From 69de5d90ab983216a8210d76bb8d79923498bdc6 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 16 Feb 2026 03:38:14 +0000 Subject: [PATCH 1/3] Restore tag-skipping for non-content HTML tags in process_html_file Re-add SKIP_TAGS logic that was removed during paragraph-based refactor. Paragraphs inside script, style, head, noscript, svg, nav, and footer tags are now skipped (no flite processing), reducing unnecessary work for ebook conversion. Co-Authored-By: tom mottes --- main.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index c4867aa..0a810e1 100644 --- a/main.py +++ b/main.py @@ -455,6 +455,20 @@ def flush_batch(): PARAGRAPH_PATTERN = re.compile(r'(<p[^>]*>)(.*?)(</p>)', re.DOTALL | re.IGNORECASE) TAG_PATTERN = re.compile(r'<[^>]*>') +SKIP_TAGS = {'script', 'style', 'head', 'noscript', 'svg', 'nav', 'footer'} +SKIP_TAG_PATTERN = re.compile( + r'<(?P<tag>' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?</(?P=tag)>', + re.DOTALL | re.IGNORECASE +) + +def _get_skip_ranges(content: str) -> List[Tuple[int, int]]: + return [(m.start(), m.end()) for m in SKIP_TAG_PATTERN.finditer(content)] + +def _in_skip_range(pos: int, skip_ranges: List[Tuple[int, int]]) -> bool: + for start, end in skip_ranges: + if start <= pos < end: + return True + return False def _decode_html_text(text: str) -> str: decoded = html_module.unescape(text) @@ -524,7 +538,8 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool with open(input_path, 'r', encoding='utf-8') as f: content = f.read() - matches = list(PARAGRAPH_PATTERN.finditer(content)) + skip_ranges = _get_skip_ranges(content) + matches = [m for m in PARAGRAPH_PATTERN.finditer(content) if not _in_skip_range(m.start(), skip_ranges)] paragraph_count = len(matches) checkpoint_path = get_checkpoint_path(output_path) if output_path else None @@ -543,6 +558,8 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool if not output_path: counter = [0] def replace_paragraph(match): + if _in_skip_range(match.start(), skip_ranges): + return match.group(0) counter[0] += 1 return _process_single_paragraph(match, paragraph_count, counter[0]) print(PARAGRAPH_PATTERN.sub(replace_paragraph, content)) @@ -671,3 +688,4 @@ def main(): if __name__ == "__main__": main() + \ No newline at end of file From 8fdf5e88578e514014e83408a902b8214b205dad Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 16 Feb 2026 03:45:20 +0000 Subject: [PATCH 2/3] Unify skip logic: both stdout and file paths use filtered matches list Removed the separate _in_skip_range check from the stdout path's replace_paragraph callback. 
Now both code paths iterate over the same pre-filtered matches list, so skip logic lives in one place. Co-Authored-By: tom mottes --- main.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 0a810e1..a695f29 100644 --- a/main.py +++ b/main.py @@ -556,13 +556,14 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool f.truncate(output_bytes) if not output_path: - counter = [0] - def replace_paragraph(match): - if _in_skip_range(match.start(), skip_ranges): - return match.group(0) - counter[0] += 1 - return _process_single_paragraph(match, paragraph_count, counter[0]) - print(PARAGRAPH_PATTERN.sub(replace_paragraph, content)) + prev_end = 0 + result_parts = [] + for counter, match in enumerate(matches, 1): + result_parts.append(content[prev_end:match.start()]) + result_parts.append(_process_single_paragraph(match, paragraph_count, counter)) + prev_end = match.end() + result_parts.append(content[prev_end:]) + print(''.join(result_parts)) return mode = "a" if start_paragraph > 0 else "w" From 68479a515d02cdab8e273fd0ae1e7bb43555fa41 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 16 Feb 2026 03:53:25 +0000 Subject: [PATCH 3/3] Merge stdout and file-output paths into single code path Use sys.stdout when no output_path, eliminating the separate if-not-output_path branch. Both cases now share the exact same iteration and batching logic. 
Co-Authored-By: tom mottes --- main.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/main.py b/main.py index a695f29..52ca3c1 100644 --- a/main.py +++ b/main.py @@ -9,6 +9,7 @@ import re import string import subprocess +import sys from typing import Dict, List, Optional, Tuple import unicodedata import os @@ -555,19 +556,11 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool with open(output_path, "r+b") as f: f.truncate(output_bytes) - if not output_path: - prev_end = 0 - result_parts = [] - for counter, match in enumerate(matches, 1): - result_parts.append(content[prev_end:match.start()]) - result_parts.append(_process_single_paragraph(match, paragraph_count, counter)) - prev_end = match.end() - result_parts.append(content[prev_end:]) - print(''.join(result_parts)) - return - - mode = "a" if start_paragraph > 0 else "w" - out_file = open(output_path, mode, encoding='utf-8') + if output_path: + mode = "a" if start_paragraph > 0 else "w" + out_file = open(output_path, mode, encoding='utf-8') + else: + out_file = sys.stdout prev_end = matches[start_paragraph - 1].end() if start_paragraph > 0 else 0 for batch_start in range(start_paragraph, len(matches), FLITE_BATCH_SIZE): @@ -606,7 +599,8 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool out_file.write(content[prev_end:]) out_file.flush() - out_file.close() + if output_path: + out_file.close() if checkpoint_path: remove_checkpoint(checkpoint_path)