diff --git a/main.py b/main.py index c91677f..0f6e4c1 100644 --- a/main.py +++ b/main.py @@ -461,6 +461,41 @@ def flush_batch(): r'<(?P' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?', re.DOTALL | re.IGNORECASE ) +# Each rule is (tag, attr, value) — elements matching will be stripped. +SKIP_ATTR_RULES = [ + ('div', 'id', 'secondary'), + ('div', 'id', 'actionbar'), +] + +def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str: + """Remove elements matching (tag, attr, value) rules, handling nesting.""" + for tag_name, attr, value in rules: + pattern = re.compile( + r'<' + re.escape(tag_name) + r'\b[^>]*\b' + + re.escape(attr) + r'\s*=\s*["\']' + re.escape(value) + r'["\'][^>]*>', + re.IGNORECASE + ) + while True: + m = pattern.search(content) + if not m: + break + tag_open = re.compile(r'<' + re.escape(tag_name) + r'\b', re.IGNORECASE) + tag_close = re.compile(r'', re.IGNORECASE) + depth = 1 + pos = m.end() + while depth > 0 and pos < len(content): + open_m = tag_open.search(content, pos) + close_m = tag_close.search(content, pos) + if close_m is None: + break + if open_m and open_m.start() < close_m.start(): + depth += 1 + pos = open_m.end() + else: + depth -= 1 + pos = close_m.end() + content = content[:m.start()] + content[pos:] + return content def _decode_html_text(text: str) -> str: @@ -532,6 +567,7 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool content = f.read() content = SKIP_TAG_PATTERN.sub('', content) + content = _strip_tags_by_attr(content) matches = list(PARAGRAPH_PATTERN.finditer(content)) paragraph_count = len(matches) diff --git a/test_main.py b/test_main.py index 869c9d4..ecf32b9 100644 --- a/test_main.py +++ b/test_main.py @@ -22,6 +22,7 @@ remove_checkpoint, _decode_html_text, _decode_text_nodes, + _strip_tags_by_attr, process_html_file, is_verb_in_sentence, ipa_vowels, @@ -327,6 +328,72 @@ def test_double_word_reductions_non_empty(self): assert len(double_word_reductions) > 0 +class TestStripTagsByAttr: + def test_strips_secondary_div(self): + html = '

sidebar

main

' + result = _strip_tags_by_attr(html) + assert 'secondary' not in result + assert 'sidebar' not in result + assert '

main

' in result + + def test_strips_actionbar_div(self): + html = '

content

' + result = _strip_tags_by_attr(html) + assert 'actionbar' not in result + assert 'Click' not in result + assert '

content

' in result + + def test_handles_nested_divs(self): + html = '

nested

kept

' + result = _strip_tags_by_attr(html) + assert 'secondary' not in result + assert 'nested' not in result + assert '

kept

' in result + + def test_preserves_other_divs(self): + html = '

main

side

' + result = _strip_tags_by_attr(html) + assert '
' in result + assert '

main

' in result + assert 'secondary' not in result + + def test_strips_both_ids(self): + html = '
A
B

C

' + result = _strip_tags_by_attr(html) + assert 'secondary' not in result + assert 'actionbar' not in result + assert '

C

' in result + + def test_no_matching_divs_unchanged(self): + html = '

hello

' + assert _strip_tags_by_attr(html) == html + + def test_id_not_first_attribute(self): + html = '

sidebar

main

' + result = _strip_tags_by_attr(html) + assert 'secondary' not in result + assert 'sidebar' not in result + assert '

main

' in result + + def test_strips_non_div_tag_with_custom_rule(self): + html = '

inside

outside

' + result = _strip_tags_by_attr(html, rules=[('section', 'id', 'secondary')]) + assert 'secondary' not in result + assert 'inside' not in result + assert '

outside

' in result + + def test_does_not_strip_non_div_by_default(self): + html = '

inside

outside

' + result = _strip_tags_by_attr(html) + assert '
' in result + + def test_custom_rules(self): + html = '

ad

content

' + result = _strip_tags_by_attr(html, rules=[('div', 'class', 'ads')]) + assert 'ads' not in result + assert '

content

' in result + + def _flite_available(): flite_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'flite', 'bin', 'flite') if not os.path.isfile(flite_path):