blocks, handling nested divs."""
- while True:
- m = _SKIP_DIV_OPEN.search(content)
- if not m:
- break
- depth = 1
- pos = m.end()
- while depth > 0 and pos < len(content):
- open_m = _DIV_OPEN.search(content, pos)
- close_m = _DIV_CLOSE.search(content, pos)
- if close_m is None:
+# Each rule is (attr, value) — any tag with that attribute=value will be stripped.
+SKIP_ATTR_RULES = [
+ ('id', 'secondary'),
+ ('id', 'actionbar'),
+]
+
+def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str:
+ """Remove any element whose opening tag matches an (attr, value) rule, handling nesting."""
+ for attr, value in rules:
+ pattern = re.compile(
+ r'<(?P<tag>[a-zA-Z][a-zA-Z0-9]*)\b[^>]*\b'
+ + re.escape(attr) + r'\s*=\s*["\']' + re.escape(value) + r'["\'][^>]*>',
+ re.IGNORECASE
+ )
+ while True:
+ m = pattern.search(content)
+ if not m:
break
- if open_m and open_m.start() < close_m.start():
- depth += 1
- pos = open_m.end()
- else:
- depth -= 1
- pos = close_m.end()
- content = content[:m.start()] + content[pos:]
+ tag = m.group('tag')
+ tag_open = re.compile(r'<' + re.escape(tag) + r'\b', re.IGNORECASE)
+ tag_close = re.compile(r'</' + re.escape(tag) + r'\s*>', re.IGNORECASE)
+ depth = 1
+ pos = m.end()
+ while depth > 0 and pos < len(content):
+ open_m = tag_open.search(content, pos)
+ close_m = tag_close.search(content, pos)
+ if close_m is None:
+ break
+ if open_m and open_m.start() < close_m.start():
+ depth += 1
+ pos = open_m.end()
+ else:
+ depth -= 1
+ pos = close_m.end()
+ content = content[:m.start()] + content[pos:]
return content
@@ -561,7 +568,7 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
content = f.read()
content = SKIP_TAG_PATTERN.sub('', content)
- content = _strip_divs_by_id(content)
+ content = _strip_tags_by_attr(content)
matches = list(PARAGRAPH_PATTERN.finditer(content))
paragraph_count = len(matches)
diff --git a/test_main.py b/test_main.py
index 9b55a99..7363561 100644
--- a/test_main.py
+++ b/test_main.py
@@ -22,7 +22,7 @@
remove_checkpoint,
_decode_html_text,
_decode_text_nodes,
- _strip_divs_by_id,
+ _strip_tags_by_attr,
process_html_file,
is_verb_in_sentence,
ipa_vowels,
@@ -328,45 +328,65 @@ def test_double_word_reductions_non_empty(self):
assert len(double_word_reductions) > 0
-class TestStripDivsById:
+class TestStripTagsByAttr:
def test_strips_secondary_div(self):
html = '
sidebar
main
'
- result = _strip_divs_by_id(html)
+ result = _strip_tags_by_attr(html)
assert 'secondary' not in result
assert 'sidebar' not in result
assert '
main
' in result
def test_strips_actionbar_div(self):
html = '
content
'
- result = _strip_divs_by_id(html)
+ result = _strip_tags_by_attr(html)
assert 'actionbar' not in result
assert 'Click' not in result
assert '
content
' in result
def test_handles_nested_divs(self):
html = '
nested
kept
'
- result = _strip_divs_by_id(html)
+ result = _strip_tags_by_attr(html)
assert 'secondary' not in result
assert 'nested' not in result
assert '
kept
' in result
def test_preserves_other_divs(self):
html = '
main
side
'
- result = _strip_divs_by_id(html)
+ result = _strip_tags_by_attr(html)
assert '
' in result
assert '
main
' in result
assert 'secondary' not in result
def test_strips_both_ids(self):
html = '
A
B
C
'
- result = _strip_divs_by_id(html)
+ result = _strip_tags_by_attr(html)
assert 'secondary' not in result
assert 'actionbar' not in result
assert '
C
' in result
def test_no_matching_divs_unchanged(self):
html = '
hello
'
- assert _strip_divs_by_id(html) == html
+ assert _strip_tags_by_attr(html) == html
+
+ def test_id_not_first_attribute(self):
+ html = '
sidebar
main
'
+ result = _strip_tags_by_attr(html)
+ assert 'secondary' not in result
+ assert 'sidebar' not in result
+ assert '
main
' in result
+
+ def test_strips_non_div_tag(self):
+ html = '
inside
outside
'
+ result = _strip_tags_by_attr(html)
+ assert 'secondary' not in result
+ assert 'inside' not in result
+ assert '
outside
' in result
+
+ def test_custom_rules(self):
+ html = '
ad
content
'
+ result = _strip_tags_by_attr(html, rules=[('class', 'ads')])
+ assert 'ads' not in result
+ assert '
content
' in result
def _flite_available():
From 98cbc9b2031b4ed92adb4b001f2a9f0196016637 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Mon, 16 Feb 2026 04:37:09 +0000
Subject: [PATCH 3/3] Restrict default rules to div tag; add tag name to (tag,
attr, value) tuples
Co-Authored-By: tom mottes
---
main.py | 17 ++++++++---------
test_main.py | 11 ++++++++---
2 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/main.py b/main.py
index e66392d..0f6e4c1 100644
--- a/main.py
+++ b/main.py
@@ -461,17 +461,17 @@ def flush_batch():
r'<(?P<tag>' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?</(?P=tag)>',
re.DOTALL | re.IGNORECASE
)
-# Each rule is (attr, value) — any tag with that attribute=value will be stripped.
+# Each rule is (tag, attr, value) — elements matching will be stripped.
SKIP_ATTR_RULES = [
- ('id', 'secondary'),
- ('id', 'actionbar'),
+ ('div', 'id', 'secondary'),
+ ('div', 'id', 'actionbar'),
]
def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str:
- """Remove any element whose opening tag matches an (attr, value) rule, handling nesting."""
- for attr, value in rules:
+ """Remove elements matching (tag, attr, value) rules, handling nesting."""
+ for tag_name, attr, value in rules:
pattern = re.compile(
- r'<(?P<tag>[a-zA-Z][a-zA-Z0-9]*)\b[^>]*\b'
+ r'<' + re.escape(tag_name) + r'\b[^>]*\b'
+ re.escape(attr) + r'\s*=\s*["\']' + re.escape(value) + r'["\'][^>]*>',
re.IGNORECASE
)
@@ -479,9 +479,8 @@ def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str:
m = pattern.search(content)
if not m:
break
- tag = m.group('tag')
- tag_open = re.compile(r'<' + re.escape(tag) + r'\b', re.IGNORECASE)
- tag_close = re.compile(r'</' + re.escape(tag) + r'\s*>', re.IGNORECASE)
+ tag_open = re.compile(r'<' + re.escape(tag_name) + r'\b', re.IGNORECASE)
+ tag_close = re.compile(r'</' + re.escape(tag_name) + r'\s*>', re.IGNORECASE)
depth = 1
pos = m.end()
while depth > 0 and pos < len(content):
diff --git a/test_main.py b/test_main.py
index 7363561..ecf32b9 100644
--- a/test_main.py
+++ b/test_main.py
@@ -375,16 +375,21 @@ def test_id_not_first_attribute(self):
assert 'sidebar' not in result
assert '
main
' in result
- def test_strips_non_div_tag(self):
+ def test_strips_non_div_tag_with_custom_rule(self):
html = '
inside
outside
'
- result = _strip_tags_by_attr(html)
+ result = _strip_tags_by_attr(html, rules=[('section', 'id', 'secondary')])
assert 'secondary' not in result
assert 'inside' not in result
assert '
outside
' in result
+ def test_does_not_strip_non_div_by_default(self):
+ html = '
inside
outside
'
+ result = _strip_tags_by_attr(html)
+ assert '' in result
+
def test_custom_rules(self):
html = '
ad
content
'
- result = _strip_tags_by_attr(html, rules=[('class', 'ads')])
+ result = _strip_tags_by_attr(html, rules=[('div', 'class', 'ads')])
assert 'ads' not in result
assert '