From 7c7b67170c82e787b7fc4a3d2ce00a2d49e57f9b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 16 Feb 2026 04:25:18 +0000 Subject: [PATCH 1/3] Strip
<div id="secondary"> and <div id="actionbar">
blocks from HTML output Co-Authored-By: tom mottes --- main.py | 30 ++++++++++++++++++++++++++++++ test_main.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/main.py b/main.py index c91677f..685bf2f 100644 --- a/main.py +++ b/main.py @@ -461,6 +461,35 @@ def flush_batch(): r'<(?P' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?', re.DOTALL | re.IGNORECASE ) +SKIP_DIV_IDS = {'secondary', 'actionbar'} +_SKIP_DIV_OPEN = re.compile( + r']*\bid\s*=\s*["\'](' + '|'.join(SKIP_DIV_IDS) + r')["\'][^>]*>', + re.IGNORECASE +) +_DIV_OPEN = re.compile(r'', re.IGNORECASE) + +def _strip_divs_by_id(content: str) -> str: + """Remove
<div id="secondary"> and <div id="actionbar">
blocks, handling nested divs.""" + while True: + m = _SKIP_DIV_OPEN.search(content) + if not m: + break + depth = 1 + pos = m.end() + while depth > 0 and pos < len(content): + open_m = _DIV_OPEN.search(content, pos) + close_m = _DIV_CLOSE.search(content, pos) + if close_m is None: + break + if open_m and open_m.start() < close_m.start(): + depth += 1 + pos = open_m.end() + else: + depth -= 1 + pos = close_m.end() + content = content[:m.start()] + content[pos:] + return content def _decode_html_text(text: str) -> str: @@ -532,6 +561,7 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool content = f.read() content = SKIP_TAG_PATTERN.sub('', content) + content = _strip_divs_by_id(content) matches = list(PARAGRAPH_PATTERN.finditer(content)) paragraph_count = len(matches) diff --git a/test_main.py b/test_main.py index 869c9d4..9b55a99 100644 --- a/test_main.py +++ b/test_main.py @@ -22,6 +22,7 @@ remove_checkpoint, _decode_html_text, _decode_text_nodes, + _strip_divs_by_id, process_html_file, is_verb_in_sentence, ipa_vowels, @@ -327,6 +328,47 @@ def test_double_word_reductions_non_empty(self): assert len(double_word_reductions) > 0 +class TestStripDivsById: + def test_strips_secondary_div(self): + html = '

sidebar

main

' + result = _strip_divs_by_id(html) + assert 'secondary' not in result + assert 'sidebar' not in result + assert '

main

' in result + + def test_strips_actionbar_div(self): + html = '

content

' + result = _strip_divs_by_id(html) + assert 'actionbar' not in result + assert 'Click' not in result + assert '

content

' in result + + def test_handles_nested_divs(self): + html = '

nested

kept

' + result = _strip_divs_by_id(html) + assert 'secondary' not in result + assert 'nested' not in result + assert '

kept

' in result + + def test_preserves_other_divs(self): + html = '

main

side

' + result = _strip_divs_by_id(html) + assert '
' in result + assert '

main

' in result + assert 'secondary' not in result + + def test_strips_both_ids(self): + html = '
A
B

C

' + result = _strip_divs_by_id(html) + assert 'secondary' not in result + assert 'actionbar' not in result + assert '

C

' in result + + def test_no_matching_divs_unchanged(self): + html = '

hello

' + assert _strip_divs_by_id(html) == html + + def _flite_available(): flite_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'flite', 'bin', 'flite') if not os.path.isfile(flite_path): From cd2c9f95776295115bf0f18f0dea7f53c66c42ff Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 16 Feb 2026 04:30:56 +0000 Subject: [PATCH 2/3] Generalize tag stripping to support any tag with any attribute match Co-Authored-By: tom mottes --- main.py | 63 +++++++++++++++++++++++++++++----------------------- test_main.py | 36 +++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 36 deletions(-) diff --git a/main.py b/main.py index 685bf2f..e66392d 100644 --- a/main.py +++ b/main.py @@ -461,34 +461,41 @@ def flush_batch(): r'<(?P' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?', re.DOTALL | re.IGNORECASE ) -SKIP_DIV_IDS = {'secondary', 'actionbar'} -_SKIP_DIV_OPEN = re.compile( - r']*\bid\s*=\s*["\'](' + '|'.join(SKIP_DIV_IDS) + r')["\'][^>]*>', - re.IGNORECASE -) -_DIV_OPEN = re.compile(r'', re.IGNORECASE) - -def _strip_divs_by_id(content: str) -> str: - """Remove
<div id="secondary"> and <div id="actionbar">
blocks, handling nested divs.""" - while True: - m = _SKIP_DIV_OPEN.search(content) - if not m: - break - depth = 1 - pos = m.end() - while depth > 0 and pos < len(content): - open_m = _DIV_OPEN.search(content, pos) - close_m = _DIV_CLOSE.search(content, pos) - if close_m is None: +# Each rule is (attr, value) — any tag with that attribute=value will be stripped. +SKIP_ATTR_RULES = [ + ('id', 'secondary'), + ('id', 'actionbar'), +] + +def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str: + """Remove any element whose opening tag matches an (attr, value) rule, handling nesting.""" + for attr, value in rules: + pattern = re.compile( + r'<(?P[a-zA-Z][a-zA-Z0-9]*)\b[^>]*\b' + + re.escape(attr) + r'\s*=\s*["\']' + re.escape(value) + r'["\'][^>]*>', + re.IGNORECASE + ) + while True: + m = pattern.search(content) + if not m: break - if open_m and open_m.start() < close_m.start(): - depth += 1 - pos = open_m.end() - else: - depth -= 1 - pos = close_m.end() - content = content[:m.start()] + content[pos:] + tag = m.group('tag') + tag_open = re.compile(r'<' + re.escape(tag) + r'\b', re.IGNORECASE) + tag_close = re.compile(r'', re.IGNORECASE) + depth = 1 + pos = m.end() + while depth > 0 and pos < len(content): + open_m = tag_open.search(content, pos) + close_m = tag_close.search(content, pos) + if close_m is None: + break + if open_m and open_m.start() < close_m.start(): + depth += 1 + pos = open_m.end() + else: + depth -= 1 + pos = close_m.end() + content = content[:m.start()] + content[pos:] return content @@ -561,7 +568,7 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool content = f.read() content = SKIP_TAG_PATTERN.sub('', content) - content = _strip_divs_by_id(content) + content = _strip_tags_by_attr(content) matches = list(PARAGRAPH_PATTERN.finditer(content)) paragraph_count = len(matches) diff --git a/test_main.py b/test_main.py index 9b55a99..7363561 100644 --- a/test_main.py +++ b/test_main.py @@ -22,7 +22,7 @@ 
remove_checkpoint, _decode_html_text, _decode_text_nodes, - _strip_divs_by_id, + _strip_tags_by_attr, process_html_file, is_verb_in_sentence, ipa_vowels, @@ -328,45 +328,65 @@ def test_double_word_reductions_non_empty(self): assert len(double_word_reductions) > 0 -class TestStripDivsById: +class TestStripTagsByAttr: def test_strips_secondary_div(self): html = '

sidebar

main

' - result = _strip_divs_by_id(html) + result = _strip_tags_by_attr(html) assert 'secondary' not in result assert 'sidebar' not in result assert '

main

' in result def test_strips_actionbar_div(self): html = '

content

' - result = _strip_divs_by_id(html) + result = _strip_tags_by_attr(html) assert 'actionbar' not in result assert 'Click' not in result assert '

content

' in result def test_handles_nested_divs(self): html = '

nested

kept

' - result = _strip_divs_by_id(html) + result = _strip_tags_by_attr(html) assert 'secondary' not in result assert 'nested' not in result assert '

kept

' in result def test_preserves_other_divs(self): html = '

main

side

' - result = _strip_divs_by_id(html) + result = _strip_tags_by_attr(html) assert '
' in result assert '

main

' in result assert 'secondary' not in result def test_strips_both_ids(self): html = '
A
B

C

' - result = _strip_divs_by_id(html) + result = _strip_tags_by_attr(html) assert 'secondary' not in result assert 'actionbar' not in result assert '

C

' in result def test_no_matching_divs_unchanged(self): html = '

hello

' - assert _strip_divs_by_id(html) == html + assert _strip_tags_by_attr(html) == html + + def test_id_not_first_attribute(self): + html = '

sidebar

main

' + result = _strip_tags_by_attr(html) + assert 'secondary' not in result + assert 'sidebar' not in result + assert '

main

' in result + + def test_strips_non_div_tag(self): + html = '

inside

outside

' + result = _strip_tags_by_attr(html) + assert 'secondary' not in result + assert 'inside' not in result + assert '

outside

' in result + + def test_custom_rules(self): + html = '

ad

content

' + result = _strip_tags_by_attr(html, rules=[('class', 'ads')]) + assert 'ads' not in result + assert '

content

' in result def _flite_available(): From 98cbc9b2031b4ed92adb4b001f2a9f0196016637 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 16 Feb 2026 04:37:09 +0000 Subject: [PATCH 3/3] Restrict default rules to div tag; add tag name to (tag, attr, value) tuples Co-Authored-By: tom mottes --- main.py | 17 ++++++++--------- test_main.py | 11 ++++++++--- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/main.py b/main.py index e66392d..0f6e4c1 100644 --- a/main.py +++ b/main.py @@ -461,17 +461,17 @@ def flush_batch(): r'<(?P' + '|'.join(SKIP_TAGS) + r')\b[^>]*>.*?', re.DOTALL | re.IGNORECASE ) -# Each rule is (attr, value) — any tag with that attribute=value will be stripped. +# Each rule is (tag, attr, value) — elements matching will be stripped. SKIP_ATTR_RULES = [ - ('id', 'secondary'), - ('id', 'actionbar'), + ('div', 'id', 'secondary'), + ('div', 'id', 'actionbar'), ] def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str: - """Remove any element whose opening tag matches an (attr, value) rule, handling nesting.""" - for attr, value in rules: + """Remove elements matching (tag, attr, value) rules, handling nesting.""" + for tag_name, attr, value in rules: pattern = re.compile( - r'<(?P[a-zA-Z][a-zA-Z0-9]*)\b[^>]*\b' + r'<' + re.escape(tag_name) + r'\b[^>]*\b' + re.escape(attr) + r'\s*=\s*["\']' + re.escape(value) + r'["\'][^>]*>', re.IGNORECASE ) @@ -479,9 +479,8 @@ def _strip_tags_by_attr(content: str, rules=SKIP_ATTR_RULES) -> str: m = pattern.search(content) if not m: break - tag = m.group('tag') - tag_open = re.compile(r'<' + re.escape(tag) + r'\b', re.IGNORECASE) - tag_close = re.compile(r'', re.IGNORECASE) + tag_open = re.compile(r'<' + re.escape(tag_name) + r'\b', re.IGNORECASE) + tag_close = re.compile(r'', re.IGNORECASE) depth = 1 pos = m.end() while depth > 0 and pos < len(content): diff --git a/test_main.py b/test_main.py index 7363561..ecf32b9 100644 --- 
a/test_main.py +++ b/test_main.py @@ -375,16 +375,21 @@ def test_id_not_first_attribute(self): assert 'sidebar' not in result assert '

main

' in result - def test_strips_non_div_tag(self): + def test_strips_non_div_tag_with_custom_rule(self): html = '

inside

outside

' - result = _strip_tags_by_attr(html) + result = _strip_tags_by_attr(html, rules=[('section', 'id', 'secondary')]) assert 'secondary' not in result assert 'inside' not in result assert '

outside

' in result + def test_does_not_strip_non_div_by_default(self): + html = '

inside

outside

' + result = _strip_tags_by_attr(html) + assert '
' in result + def test_custom_rules(self): html = '

ad

content

' - result = _strip_tags_by_attr(html, rules=[('class', 'ads')]) + result = _strip_tags_by_attr(html, rules=[('div', 'class', 'ads')]) assert 'ads' not in result assert '

content

' in result