Tomotz · Tomotz · Feb 16, 2026 · Feb 16, 2026 · Feb 16, 2026 · Feb 16, 2026
diff --git a/main.py b/main.py
@@ -461,6 +461,41 @@ def flush_batch():
     r'<(?P<tag>' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?</(?P=tag)>',
     re.DOTALL | re.IGNORECASE
 )
+# Each rule is (tag, attr, value): any element <tag ... attr="value" ...> is stripped.
+SKIP_ATTR_RULES = [
+    ('div', 'id', 'secondary'),
+    ('div', 'id', 'actionbar'),
+]
+
+def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str:
+    """Remove elements matching (tag, attr, value) rules, nested same-name tags included.
+
+    Matching ignores case. Without a balancing close tag, the cut ends at the last
+    close tag reached (or right after the opening tag when there is none).
+    """
+    for tag_name, attr, value in rules:
+        tag = re.escape(tag_name)
+        # (?<![\w-]) keeps e.g. data-id="x" from satisfying a rule on id="x".
+        target = re.compile(
+            r'<' + tag + r'\b[^>]*(?<![\w-])' + re.escape(attr)
+            + r'\s*=\s*["\']' + re.escape(value) + r'["\'][^>]*>', re.IGNORECASE)
+        tag_open = re.compile(r'<' + tag + r'\b', re.IGNORECASE)
+        tag_close = re.compile(r'</' + tag + r'\s*>', re.IGNORECASE)
+        m = target.search(content)
+        while m:
+            depth, pos = 1, m.end()
+            while depth > 0:
+                close_m = tag_close.search(content, pos)
+                if close_m is None:
+                    break  # unclosed element: stop at what has been balanced so far
+                open_m = tag_open.search(content, pos)
+                if open_m and open_m.start() < close_m.start():
+                    depth, pos = depth + 1, open_m.end()
+                else:
+                    depth, pos = depth - 1, close_m.end()
+            content = content[:m.start()] + content[pos:]
+            m = target.search(content)
+    return content
 
 
 def _decode_html_text(text: str) -> str:
@@ -532,6 +567,7 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
         content = f.read()
 
     content = SKIP_TAG_PATTERN.sub('', content)
+    content = _strip_tags_by_attr(content)
     matches = list(PARAGRAPH_PATTERN.finditer(content))
     paragraph_count = len(matches)
 

diff --git a/test_main.py b/test_main.py
@@ -22,6 +22,7 @@
     remove_checkpoint,
     _decode_html_text,
     _decode_text_nodes,
+    _strip_tags_by_attr,
     process_html_file,
     is_verb_in_sentence,
     ipa_vowels,
@@ -327,6 +328,72 @@ def test_double_word_reductions_non_empty(self):
         assert len(double_word_reductions) > 0
 
 
+class TestStripTagsByAttr:
+    def test_strips_secondary_div(self):
+        """A div with id="secondary" is removed together with its content."""
+        markup = '<body><div id="secondary"><p>sidebar</p></div><p>main</p></body>'
+        stripped = _strip_tags_by_attr(markup)
+        assert 'secondary' not in stripped and 'sidebar' not in stripped
+        assert '<p>main</p>' in stripped
+
+    def test_strips_actionbar_div(self):
+        """A div with id="actionbar" is removed together with its content."""
+        markup = '<body><div id="actionbar"><button>Click</button></div><p>content</p></body>'
+        stripped = _strip_tags_by_attr(markup)
+        assert 'actionbar' not in stripped and 'Click' not in stripped
+        assert '<p>content</p>' in stripped
+
+    def test_handles_nested_divs(self):
+        """Divs nested inside the matched div are removed with it."""
+        markup = '<div id="secondary"><div class="inner"><p>nested</p></div></div><p>kept</p>'
+        stripped = _strip_tags_by_attr(markup)
+        assert 'secondary' not in stripped and 'nested' not in stripped
+        assert '<p>kept</p>' in stripped
+
+    def test_preserves_other_divs(self):
+        """A div whose id matches no rule is left in place."""
+        markup = '<div id="primary"><p>main</p></div><div id="secondary"><p>side</p></div>'
+        stripped = _strip_tags_by_attr(markup)
+        assert '<div id="primary">' in stripped and '<p>main</p>' in stripped
+        assert 'secondary' not in stripped
+
+    def test_strips_both_ids(self):
+        """Both default ids are stripped from the same document."""
+        markup = '<div id="secondary">A</div><div id="actionbar">B</div><p>C</p>'
+        stripped = _strip_tags_by_attr(markup)
+        assert 'secondary' not in stripped and 'actionbar' not in stripped
+        assert '<p>C</p>' in stripped
+
+    def test_no_matching_divs_unchanged(self):
+        markup = '<div id="content"><p>hello</p></div>'  # no default rule targets this id
+        assert _strip_tags_by_attr(markup) == markup
+
+    def test_id_not_first_attribute(self):
+        """The id is still matched when another attribute precedes it."""
+        markup = '<body><div class="updateable widget-area" id="secondary"><p>sidebar</p></div><p>main</p></body>'
+        stripped = _strip_tags_by_attr(markup)
+        assert 'secondary' not in stripped and 'sidebar' not in stripped
+        assert '<p>main</p>' in stripped
+
+    def test_strips_non_div_tag_with_custom_rule(self):
+        """A caller-supplied rule can target a tag other than div."""
+        markup = '<section id="secondary"><p>inside</p></section><p>outside</p>'
+        stripped = _strip_tags_by_attr(markup, rules=[('section', 'id', 'secondary')])
+        assert 'secondary' not in stripped and 'inside' not in stripped
+        assert '<p>outside</p>' in stripped
+
+    def test_does_not_strip_non_div_by_default(self):
+        """The default rules leave a section with a matching id alone."""
+        markup = '<section id="secondary"><p>inside</p></section><p>outside</p>'
+        assert '<section id="secondary">' in _strip_tags_by_attr(markup)
+
+    def test_custom_rules(self):
+        """A caller-supplied rule can match on an attribute other than id."""
+        markup = '<div class="ads"><p>ad</p></div><p>content</p>'
+        stripped = _strip_tags_by_attr(markup, rules=[('div', 'class', 'ads')])
+        assert 'ads' not in stripped and '<p>content</p>' in stripped
+
+
 def _flite_available():
     flite_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'flite', 'bin', 'flite')
     if not os.path.isfile(flite_path):