From 7c7b67170c82e787b7fc4a3d2ce00a2d49e57f9b Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 04:25:18 +0000
Subject: [PATCH 1/3] Strip <div id="secondary"> and <div id="actionbar">
 blocks from HTML output

Co-Authored-By: tom mottes <tom.mottes@gmail.com>
---
 main.py      | 30 ++++++++++++++++++++++++++++++
 test_main.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
diff --git a/main.py b/main.py
index c91677f..685bf2f 100644
--- a/main.py
+++ b/main.py
@@ -461,6 +461,35 @@ def flush_batch():
     r'<(?P<tag>' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?</(?P=tag)>',
     re.DOTALL | re.IGNORECASE
 )
+SKIP_DIV_IDS = {'secondary', 'actionbar'}
+_SKIP_DIV_OPEN = re.compile(
+    r'<div\b[^>]*\bid\s*=\s*["\'](' + '|'.join(SKIP_DIV_IDS) + r')["\'][^>]*>',
+    re.IGNORECASE
+)
+_DIV_OPEN = re.compile(r'<div\b', re.IGNORECASE)
+_DIV_CLOSE = re.compile(r'</div\s*>', re.IGNORECASE)
+
+def _strip_divs_by_id(content: str) -> str:
+    """Remove <div id="secondary"> and <div id="actionbar"> blocks, handling nested divs."""
+    while True:
+        m = _SKIP_DIV_OPEN.search(content)
+        if not m:
+            break
+        depth = 1
+        pos = m.end()
+        while depth > 0 and pos < len(content):
+            open_m = _DIV_OPEN.search(content, pos)
+            close_m = _DIV_CLOSE.search(content, pos)
+            if close_m is None:
+                break
+            if open_m and open_m.start() < close_m.start():
+                depth += 1
+                pos = open_m.end()
+            else:
+                depth -= 1
+                pos = close_m.end()
+        content = content[:m.start()] + content[pos:]
+    return content
 
 
 def _decode_html_text(text: str) -> str:
@@ -532,6 +561,7 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
         content = f.read()
 
     content = SKIP_TAG_PATTERN.sub('', content)
+    content = _strip_divs_by_id(content)
     matches = list(PARAGRAPH_PATTERN.finditer(content))
     paragraph_count = len(matches)
 
diff --git a/test_main.py b/test_main.py
index 869c9d4..9b55a99 100644
--- a/test_main.py
+++ b/test_main.py
@@ -22,6 +22,7 @@
     remove_checkpoint,
     _decode_html_text,
     _decode_text_nodes,
+    _strip_divs_by_id,
     process_html_file,
     is_verb_in_sentence,
     ipa_vowels,
@@ -327,6 +328,47 @@ def test_double_word_reductions_non_empty(self):
         assert len(double_word_reductions) > 0
 
 
+class TestStripDivsById:
+    def test_strips_secondary_div(self):
+        html = '<body><div id="secondary"><p>sidebar</p></div><p>main</p></body>'
+        result = _strip_divs_by_id(html)
+        assert 'secondary' not in result
+        assert 'sidebar' not in result
+        assert '<p>main</p>' in result
+
+    def test_strips_actionbar_div(self):
+        html = '<body><div id="actionbar"><button>Click</button></div><p>content</p></body>'
+        result = _strip_divs_by_id(html)
+        assert 'actionbar' not in result
+        assert 'Click' not in result
+        assert '<p>content</p>' in result
+
+    def test_handles_nested_divs(self):
+        html = '<div id="secondary"><div class="inner"><p>nested</p></div></div><p>kept</p>'
+        result = _strip_divs_by_id(html)
+        assert 'secondary' not in result
+        assert 'nested' not in result
+        assert '<p>kept</p>' in result
+
+    def test_preserves_other_divs(self):
+        html = '<div id="primary"><p>main</p></div><div id="secondary"><p>side</p></div>'
+        result = _strip_divs_by_id(html)
+        assert '<div id="primary">' in result
+        assert '<p>main</p>' in result
+        assert 'secondary' not in result
+
+    def test_strips_both_ids(self):
+        html = '<div id="secondary">A</div><div id="actionbar">B</div><p>C</p>'
+        result = _strip_divs_by_id(html)
+        assert 'secondary' not in result
+        assert 'actionbar' not in result
+        assert '<p>C</p>' in result
+
+    def test_no_matching_divs_unchanged(self):
+        html = '<div id="content"><p>hello</p></div>'
+        assert _strip_divs_by_id(html) == html
+
+
 def _flite_available():
     flite_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'flite', 'bin', 'flite')
     if not os.path.isfile(flite_path):

From cd2c9f95776295115bf0f18f0dea7f53c66c42ff Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 04:30:56 +0000
Subject: [PATCH 2/3] Generalize tag stripping to support any tag with any
 attribute match

Co-Authored-By: tom mottes <tom.mottes@gmail.com>
---
 main.py      | 63 +++++++++++++++++++++++++++++-----------------------
 test_main.py | 36 +++++++++++++++++++++++-------
 2 files changed, 63 insertions(+), 36 deletions(-)

diff --git a/main.py b/main.py
index 685bf2f..e66392d 100644
--- a/main.py
+++ b/main.py
@@ -461,34 +461,41 @@ def flush_batch():
     r'<(?P<tag>' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?</(?P=tag)>',
     re.DOTALL | re.IGNORECASE
 )
-SKIP_DIV_IDS = {'secondary', 'actionbar'}
-_SKIP_DIV_OPEN = re.compile(
-    r'<div\b[^>]*\bid\s*=\s*["\'](' + '|'.join(SKIP_DIV_IDS) + r')["\'][^>]*>',
-    re.IGNORECASE
-)
-_DIV_OPEN = re.compile(r'<div\b', re.IGNORECASE)
-_DIV_CLOSE = re.compile(r'</div\s*>', re.IGNORECASE)
-
-def _strip_divs_by_id(content: str) -> str:
-    """Remove <div id="secondary"> and <div id="actionbar"> blocks, handling nested divs."""
-    while True:
-        m = _SKIP_DIV_OPEN.search(content)
-        if not m:
-            break
-        depth = 1
-        pos = m.end()
-        while depth > 0 and pos < len(content):
-            open_m = _DIV_OPEN.search(content, pos)
-            close_m = _DIV_CLOSE.search(content, pos)
-            if close_m is None:
+# Each rule is (attr, value) — any tag with that attribute=value will be stripped.
+SKIP_ATTR_RULES = [
+    ('id', 'secondary'),
+    ('id', 'actionbar'),
+]
+
+def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str:
+    """Remove any element whose opening tag matches an (attr, value) rule, handling nesting."""
+    for attr, value in rules:
+        pattern = re.compile(
+            r'<(?P<tag>[a-zA-Z][a-zA-Z0-9]*)\b[^>]*\b'
+            + re.escape(attr) + r'\s*=\s*["\']' + re.escape(value) + r'["\'][^>]*>',
+            re.IGNORECASE
+        )
+        while True:
+            m = pattern.search(content)
+            if not m:
                 break
-            if open_m and open_m.start() < close_m.start():
-                depth += 1
-                pos = open_m.end()
-            else:
-                depth -= 1
-                pos = close_m.end()
-        content = content[:m.start()] + content[pos:]
+            tag = m.group('tag')
+            tag_open = re.compile(r'<' + re.escape(tag) + r'\b', re.IGNORECASE)
+            tag_close = re.compile(r'</' + re.escape(tag) + r'\s*>', re.IGNORECASE)
+            depth = 1
+            pos = m.end()
+            while depth > 0 and pos < len(content):
+                open_m = tag_open.search(content, pos)
+                close_m = tag_close.search(content, pos)
+                if close_m is None:
+                    break
+                if open_m and open_m.start() < close_m.start():
+                    depth += 1
+                    pos = open_m.end()
+                else:
+                    depth -= 1
+                    pos = close_m.end()
+            content = content[:m.start()] + content[pos:]
     return content
 
 
@@ -561,7 +568,7 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
         content = f.read()
 
     content = SKIP_TAG_PATTERN.sub('', content)
-    content = _strip_divs_by_id(content)
+    content = _strip_tags_by_attr(content)
     matches = list(PARAGRAPH_PATTERN.finditer(content))
     paragraph_count = len(matches)
 
diff --git a/test_main.py b/test_main.py
index 9b55a99..7363561 100644
--- a/test_main.py
+++ b/test_main.py
@@ -22,7 +22,7 @@
     remove_checkpoint,
     _decode_html_text,
     _decode_text_nodes,
-    _strip_divs_by_id,
+    _strip_tags_by_attr,
     process_html_file,
     is_verb_in_sentence,
     ipa_vowels,
@@ -328,45 +328,65 @@ def test_double_word_reductions_non_empty(self):
         assert len(double_word_reductions) > 0
 
 
-class TestStripDivsById:
+class TestStripTagsByAttr:
     def test_strips_secondary_div(self):
         html = '<body><div id="secondary"><p>sidebar</p></div><p>main</p></body>'
-        result = _strip_divs_by_id(html)
+        result = _strip_tags_by_attr(html)
         assert 'secondary' not in result
         assert 'sidebar' not in result
         assert '<p>main</p>' in result
 
     def test_strips_actionbar_div(self):
         html = '<body><div id="actionbar"><button>Click</button></div><p>content</p></body>'
-        result = _strip_divs_by_id(html)
+        result = _strip_tags_by_attr(html)
         assert 'actionbar' not in result
         assert 'Click' not in result
         assert '<p>content</p>' in result
 
     def test_handles_nested_divs(self):
         html = '<div id="secondary"><div class="inner"><p>nested</p></div></div><p>kept</p>'
-        result = _strip_divs_by_id(html)
+        result = _strip_tags_by_attr(html)
         assert 'secondary' not in result
         assert 'nested' not in result
         assert '<p>kept</p>' in result
 
     def test_preserves_other_divs(self):
         html = '<div id="primary"><p>main</p></div><div id="secondary"><p>side</p></div>'
-        result = _strip_divs_by_id(html)
+        result = _strip_tags_by_attr(html)
         assert '<div id="primary">' in result
         assert '<p>main</p>' in result
         assert 'secondary' not in result
 
     def test_strips_both_ids(self):
         html = '<div id="secondary">A</div><div id="actionbar">B</div><p>C</p>'
-        result = _strip_divs_by_id(html)
+        result = _strip_tags_by_attr(html)
         assert 'secondary' not in result
         assert 'actionbar' not in result
         assert '<p>C</p>' in result
 
     def test_no_matching_divs_unchanged(self):
         html = '<div id="content"><p>hello</p></div>'
-        assert _strip_divs_by_id(html) == html
+        assert _strip_tags_by_attr(html) == html
+
+    def test_id_not_first_attribute(self):
+        html = '<body><div class="updateable widget-area" id="secondary"><p>sidebar</p></div><p>main</p></body>'
+        result = _strip_tags_by_attr(html)
+        assert 'secondary' not in result
+        assert 'sidebar' not in result
+        assert '<p>main</p>' in result
+
+    def test_strips_non_div_tag(self):
+        html = '<section id="secondary"><p>inside</p></section><p>outside</p>'
+        result = _strip_tags_by_attr(html)
+        assert 'secondary' not in result
+        assert 'inside' not in result
+        assert '<p>outside</p>' in result
+
+    def test_custom_rules(self):
+        html = '<div class="ads"><p>ad</p></div><p>content</p>'
+        result = _strip_tags_by_attr(html, rules=[('class', 'ads')])
+        assert 'ads' not in result
+        assert '<p>content</p>' in result
 
 
 def _flite_available():

From 98cbc9b2031b4ed92adb4b001f2a9f0196016637 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 04:37:09 +0000
Subject: [PATCH 3/3] Restrict default rules to div tag; add tag name to (tag,
 attr, value) tuples

Co-Authored-By: tom mottes <tom.mottes@gmail.com>
---
 main.py      | 17 ++++++++---------
 test_main.py | 11 ++++++++---
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/main.py b/main.py
index e66392d..0f6e4c1 100644
--- a/main.py
+++ b/main.py
@@ -461,17 +461,17 @@ def flush_batch():
     r'<(?P<tag>' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?</(?P=tag)>',
     re.DOTALL | re.IGNORECASE
 )
-# Each rule is (attr, value) — any tag with that attribute=value will be stripped.
+# Each rule is (tag, attr, value): strips <tag ... attr="value" ...>; attr is matched as a whole name (id never matches data-id).
 SKIP_ATTR_RULES = [
-    ('id', 'secondary'),
-    ('id', 'actionbar'),
+    ('div', 'id', 'secondary'),
+    ('div', 'id', 'actionbar'),
 ]
 
 def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str:
-    """Remove any element whose opening tag matches an (attr, value) rule, handling nesting."""
-    for attr, value in rules:
+    """Remove elements matching (tag, attr, value) rules, handling nesting."""
+    for tag_name, attr, value in rules:
         pattern = re.compile(
-            r'<(?P<tag>[a-zA-Z][a-zA-Z0-9]*)\b[^>]*\b'
+            r'<' + re.escape(tag_name) + r'\b[^>]*(?<![\w:.-])'
             + re.escape(attr) + r'\s*=\s*["\']' + re.escape(value) + r'["\'][^>]*>',
             re.IGNORECASE
         )
@@ -479,9 +479,8 @@ def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str:
             m = pattern.search(content)
             if not m:
                 break
-            tag = m.group('tag')
-            tag_open = re.compile(r'<' + re.escape(tag) + r'\b', re.IGNORECASE)
-            tag_close = re.compile(r'</' + re.escape(tag) + r'\s*>', re.IGNORECASE)
+            tag_open = re.compile(r'<' + re.escape(tag_name) + r'\b', re.IGNORECASE)
+            tag_close = re.compile(r'</' + re.escape(tag_name) + r'\s*>', re.IGNORECASE)
             depth = 1
             pos = m.end()
             while depth > 0 and pos < len(content):
diff --git a/test_main.py b/test_main.py
index 7363561..ecf32b9 100644
--- a/test_main.py
+++ b/test_main.py
@@ -375,16 +375,21 @@ def test_id_not_first_attribute(self):
         assert 'sidebar' not in result
         assert '<p>main</p>' in result
 
-    def test_strips_non_div_tag(self):
+    def test_strips_non_div_tag_with_custom_rule(self):
         html = '<section id="secondary"><p>inside</p></section><p>outside</p>'
-        result = _strip_tags_by_attr(html)
+        result = _strip_tags_by_attr(html, rules=[('section', 'id', 'secondary')])
         assert 'secondary' not in result
         assert 'inside' not in result
         assert '<p>outside</p>' in result
 
+    def test_does_not_strip_non_div_by_default(self):
+        html = '<section id="secondary"><p>inside</p></section><p>outside</p>'
+        result = _strip_tags_by_attr(html)
+        assert '<section id="secondary">' in result
+
     def test_custom_rules(self):
         html = '<div class="ads"><p>ad</p></div><p>content</p>'
-        result = _strip_tags_by_attr(html, rules=[('class', 'ads')])
+        result = _strip_tags_by_attr(html, rules=[('div', 'class', 'ads')])
         assert 'ads' not in result
         assert '<p>content</p>' in result