Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,41 @@ def flush_batch():
r'<(?P<tag>' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?</(?P=tag)>',
re.DOTALL | re.IGNORECASE
)
# Default rule set for _strip_tags_by_attr.
# Each rule is a (tag, attr, value) tuple — elements whose opening tag matches
# <tag ... attr="value" ...> will be stripped together with their contents.
SKIP_ATTR_RULES = [
    ('div', 'id', 'secondary'),
    ('div', 'id', 'actionbar'),
]

def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str:
"""Remove elements matching (tag, attr, value) rules, handling nesting."""
for tag_name, attr, value in rules:
pattern = re.compile(
r'<' + re.escape(tag_name) + r'\b[^>]*\b'
+ re.escape(attr) + r'\s*=\s*["\']' + re.escape(value) + r'["\'][^>]*>',
re.IGNORECASE
)
while True:
m = pattern.search(content)
if not m:
break
tag_open = re.compile(r'<' + re.escape(tag_name) + r'\b', re.IGNORECASE)
tag_close = re.compile(r'</' + re.escape(tag_name) + r'\s*>', re.IGNORECASE)
depth = 1
pos = m.end()
while depth > 0 and pos < len(content):
open_m = tag_open.search(content, pos)
close_m = tag_close.search(content, pos)
if close_m is None:
break
if open_m and open_m.start() < close_m.start():
depth += 1
pos = open_m.end()
else:
depth -= 1
pos = close_m.end()
content = content[:m.start()] + content[pos:]
return content


def _decode_html_text(text: str) -> str:
Expand Down Expand Up @@ -532,6 +567,7 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool
content = f.read()

content = SKIP_TAG_PATTERN.sub('', content)
content = _strip_tags_by_attr(content)
matches = list(PARAGRAPH_PATTERN.finditer(content))
paragraph_count = len(matches)

Expand Down
67 changes: 67 additions & 0 deletions test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
remove_checkpoint,
_decode_html_text,
_decode_text_nodes,
_strip_tags_by_attr,
process_html_file,
is_verb_in_sentence,
ipa_vowels,
Expand Down Expand Up @@ -327,6 +328,72 @@ def test_double_word_reductions_non_empty(self):
assert len(double_word_reductions) > 0


class TestStripTagsByAttr:
    """Behavioural checks for _strip_tags_by_attr with default and custom rules."""

    def test_strips_secondary_div(self):
        """The default rules remove a div with id="secondary" and its contents."""
        markup = '<body><div id="secondary"><p>sidebar</p></div><p>main</p></body>'
        stripped = _strip_tags_by_attr(markup)
        assert 'secondary' not in stripped
        assert 'sidebar' not in stripped
        assert '<p>main</p>' in stripped

    def test_strips_actionbar_div(self):
        """The default rules remove a div with id="actionbar" and its contents."""
        markup = '<body><div id="actionbar"><button>Click</button></div><p>content</p></body>'
        stripped = _strip_tags_by_attr(markup)
        assert 'actionbar' not in stripped
        assert 'Click' not in stripped
        assert '<p>content</p>' in stripped

    def test_handles_nested_divs(self):
        """Divs nested inside the matched div are removed along with it."""
        markup = '<div id="secondary"><div class="inner"><p>nested</p></div></div><p>kept</p>'
        stripped = _strip_tags_by_attr(markup)
        assert 'secondary' not in stripped
        assert 'nested' not in stripped
        assert '<p>kept</p>' in stripped

    def test_preserves_other_divs(self):
        """A sibling div with a non-matching id survives untouched."""
        markup = '<div id="primary"><p>main</p></div><div id="secondary"><p>side</p></div>'
        stripped = _strip_tags_by_attr(markup)
        assert '<div id="primary">' in stripped
        assert '<p>main</p>' in stripped
        assert 'secondary' not in stripped

    def test_strips_both_ids(self):
        """Both default ids are stripped from the same document."""
        markup = '<div id="secondary">A</div><div id="actionbar">B</div><p>C</p>'
        stripped = _strip_tags_by_attr(markup)
        assert 'secondary' not in stripped
        assert 'actionbar' not in stripped
        assert '<p>C</p>' in stripped

    def test_no_matching_divs_unchanged(self):
        """Input without any matching element is returned as-is."""
        markup = '<div id="content"><p>hello</p></div>'
        stripped = _strip_tags_by_attr(markup)
        assert stripped == markup

    def test_id_not_first_attribute(self):
        """The id attribute is matched even when other attributes precede it."""
        markup = '<body><div class="updateable widget-area" id="secondary"><p>sidebar</p></div><p>main</p></body>'
        stripped = _strip_tags_by_attr(markup)
        assert 'secondary' not in stripped
        assert 'sidebar' not in stripped
        assert '<p>main</p>' in stripped

    def test_strips_non_div_tag_with_custom_rule(self):
        """A custom rule can target a tag other than div."""
        markup = '<section id="secondary"><p>inside</p></section><p>outside</p>'
        section_rule = ('section', 'id', 'secondary')
        stripped = _strip_tags_by_attr(markup, rules=[section_rule])
        assert 'secondary' not in stripped
        assert 'inside' not in stripped
        assert '<p>outside</p>' in stripped

    def test_does_not_strip_non_div_by_default(self):
        """The default rules leave a section with a matching id in place."""
        markup = '<section id="secondary"><p>inside</p></section><p>outside</p>'
        stripped = _strip_tags_by_attr(markup)
        assert '<section id="secondary">' in stripped

    def test_custom_rules(self):
        """A custom rule can match on an attribute other than id."""
        markup = '<div class="ads"><p>ad</p></div><p>content</p>'
        ads_rule = ('div', 'class', 'ads')
        stripped = _strip_tags_by_attr(markup, rules=[ads_rule])
        assert 'ads' not in stripped
        assert '<p>content</p>' in stripped


def _flite_available():
flite_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'flite', 'bin', 'flite')
if not os.path.isfile(flite_path):
Expand Down