From 69de5d90ab983216a8210d76bb8d79923498bdc6 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 03:38:14 +0000
Subject: [PATCH 1/3] Restore tag-skipping for non-content HTML tags in
 process_html_file

Re-add the SKIP_TAGS logic that was removed during the paragraph-based
refactor. Paragraphs whose opening <p> tag falls inside a script, style,
head, noscript, svg, nav, or footer element are now skipped (no flite
processing), reducing unnecessary work for ebook conversion.

Co-Authored-By: tom mottes <tom.mottes@gmail.com>
---
 main.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/main.py b/main.py
index c4867aa..0a810e1 100644
--- a/main.py
+++ b/main.py
@@ -455,6 +455,20 @@ def flush_batch():
 
 PARAGRAPH_PATTERN = re.compile(r'(<p\b[^>]*>)(.*?)(</p>)', re.DOTALL | re.IGNORECASE)
 TAG_PATTERN = re.compile(r'<[^>]*>')
+SKIP_TAGS = {'script', 'style', 'head', 'noscript', 'svg', 'nav', 'footer'}
+SKIP_TAG_PATTERN = re.compile(
+    r'<(?P<tag>' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?</(?P=tag)>',
+    re.DOTALL | re.IGNORECASE
+)
+
+def _get_skip_ranges(content: str) -> List[Tuple[int, int]]:
+    return [(m.start(), m.end()) for m in SKIP_TAG_PATTERN.finditer(content)]
+
+def _in_skip_range(pos: int, skip_ranges: List[Tuple[int, int]]) -> bool:
+    for start, end in skip_ranges:
+        if start <= pos < end:
+            return True
+    return False
 
 def _decode_html_text(text: str) -> str:
     decoded = html_module.unescape(text)
@@ -524,7 +538,8 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
     with open(input_path, 'r', encoding='utf-8') as f:
         content = f.read()
 
-    matches = list(PARAGRAPH_PATTERN.finditer(content))
+    skip_ranges = _get_skip_ranges(content)
+    matches = [m for m in PARAGRAPH_PATTERN.finditer(content) if not _in_skip_range(m.start(), skip_ranges)]
     paragraph_count = len(matches)
 
     checkpoint_path = get_checkpoint_path(output_path) if output_path else None
@@ -543,6 +558,8 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
     if not output_path:
         counter = [0]
         def replace_paragraph(match):
+            if _in_skip_range(match.start(), skip_ranges):
+                return match.group(0)
             counter[0] += 1
             return _process_single_paragraph(match, paragraph_count, counter[0])
         print(PARAGRAPH_PATTERN.sub(replace_paragraph, content))
@@ -671,3 +688,4 @@ def main():
 if __name__ == "__main__":
     main()
 
+    
\ No newline at end of file

From 8fdf5e88578e514014e83408a902b8214b205dad Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 03:45:20 +0000
Subject: [PATCH 2/3] Unify skip logic: both stdout and file paths use filtered
 matches list

Replace the stdout path's PARAGRAPH_PATTERN.sub callback, along with its
separate _in_skip_range check, with a loop over the pre-filtered matches
list. Both code paths now iterate over the same list, so the skip logic
lives in one place.

Co-Authored-By: tom mottes <tom.mottes@gmail.com>
---
 main.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 0a810e1..a695f29 100644
--- a/main.py
+++ b/main.py
@@ -556,13 +556,14 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
                     f.truncate(output_bytes)
 
     if not output_path:
-        counter = [0]
-        def replace_paragraph(match):
-            if _in_skip_range(match.start(), skip_ranges):
-                return match.group(0)
-            counter[0] += 1
-            return _process_single_paragraph(match, paragraph_count, counter[0])
-        print(PARAGRAPH_PATTERN.sub(replace_paragraph, content))
+        prev_end = 0
+        result_parts = []
+        for counter, match in enumerate(matches, 1):
+            result_parts.append(content[prev_end:match.start()])
+            result_parts.append(_process_single_paragraph(match, paragraph_count, counter))
+            prev_end = match.end()
+        result_parts.append(content[prev_end:])
+        print(''.join(result_parts))
         return
 
     mode = "a" if start_paragraph > 0 else "w"

From 68479a515d02cdab8e273fd0ae1e7bb43555fa41 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 03:53:25 +0000
Subject: [PATCH 3/3] Merge stdout and file-output paths into single code path

Write to sys.stdout when no output_path is given, eliminating the
separate if-not-output_path branch. Both cases now share the exact same
iteration and batching logic; sys.stdout is flushed but never closed.

Co-Authored-By: tom mottes <tom.mottes@gmail.com>
---
 main.py | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/main.py b/main.py
index a695f29..52ca3c1 100644
--- a/main.py
+++ b/main.py
@@ -9,6 +9,7 @@
 import re
 import string
 import subprocess
+import sys
 from typing import Dict, List, Optional, Tuple
 import unicodedata
 import os
@@ -555,19 +556,11 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
                 with open(output_path, "r+b") as f:
                     f.truncate(output_bytes)
 
-    if not output_path:
-        prev_end = 0
-        result_parts = []
-        for counter, match in enumerate(matches, 1):
-            result_parts.append(content[prev_end:match.start()])
-            result_parts.append(_process_single_paragraph(match, paragraph_count, counter))
-            prev_end = match.end()
-        result_parts.append(content[prev_end:])
-        print(''.join(result_parts))
-        return
-
-    mode = "a" if start_paragraph > 0 else "w"
-    out_file = open(output_path, mode, encoding='utf-8')
+    if output_path:
+        mode = "a" if start_paragraph > 0 else "w"
+        out_file = open(output_path, mode, encoding='utf-8')
+    else:
+        out_file = sys.stdout
     prev_end = matches[start_paragraph - 1].end() if start_paragraph > 0 else 0
 
     for batch_start in range(start_paragraph, len(matches), FLITE_BATCH_SIZE):
@@ -606,7 +599,8 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
 
     out_file.write(content[prev_end:])
     out_file.flush()
-    out_file.close()
+    if output_path:
+        out_file.close()
     if checkpoint_path:
         remove_checkpoint(checkpoint_path)