From fb5e59d6a4a6bf1817ca1daff95ea1a86775b5dc Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sat, 14 Feb 2026 19:48:58 +0000 Subject: [PATCH] Add integration tests for process_html_file with flite Co-Authored-By: tom mottes --- test_main.py | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/test_main.py b/test_main.py index 40aea02..3f66805 100644 --- a/test_main.py +++ b/test_main.py @@ -1,5 +1,7 @@ import json import os +import re +import subprocess import tempfile import pytest @@ -20,6 +22,7 @@ remove_checkpoint, _decode_html_text, _decode_text_nodes, + process_html_file, is_verb_in_sentence, ipa_vowels, ipa_consonants, @@ -322,3 +325,129 @@ def test_h_reduction_non_empty(self): def test_double_word_reductions_non_empty(self): assert len(double_word_reductions) > 0 + + +def _flite_available(): + flite_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'flite', 'bin', 'flite') + if not os.path.isfile(flite_path): + return False + try: + subprocess.check_output([flite_path, "-t", "test", "-i"]) + return True + except (OSError, subprocess.CalledProcessError): + return False + +requires_flite = pytest.mark.skipif(not _flite_available(), reason="flite binary not available") + +IPA_CHARS = set(ipa_vowels + ipa_consonants + "ɾˈˌ") + + +def _extract_text_from_html(html_content): + return re.sub(r'<[^>]*>', '', html_content) + + +def _word_overlap_ratio(text_a, text_b): + words_a = set(re.findall(r'[a-zA-Z]+', text_a.lower())) + words_b = set(re.findall(r'[a-zA-Z]+', text_b.lower())) + if not words_a: + return 0.0 + return len(words_a & words_b) / len(words_a) + + +@requires_flite +class TestProcessHtmlFileIntegration: + + def test_simple_html_no_exception(self, tmp_path): + html = "

The cat sat on the mat.

" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + assert output_file.exists() + result = output_file.read_text(encoding="utf-8") + assert len(result) > 0 + + def test_output_contains_ipa(self, tmp_path): + html = "

The quick brown fox jumps over the lazy dog.

" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + found_ipa = any(ch in IPA_CHARS for ch in result) + assert found_ipa, "Output should contain IPA characters" + + def test_output_preserves_original_text(self, tmp_path): + html = "

Hello world, this is a simple test.

" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + plain_output = _extract_text_from_html(result) + assert _word_overlap_ratio("Hello world this is a simple test", plain_output) > 0.5 + + def test_html_tags_preserved(self, tmp_path): + html = "

Some bold text here.

" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + assert "" in result + assert "" in result + + def test_multiple_paragraphs(self, tmp_path): + html = ( + "" + "

First paragraph with some words.

" + "

Second paragraph has different words.

" + "

Third paragraph is also here.

" + "" + ) + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + p_tags = re.findall(r']*>', result) + assert len(p_tags) >= 6, "Each input paragraph should produce an IPA paragraph and an original paragraph" + + def test_non_paragraph_content_preserved(self, tmp_path): + html = "Test

Title

Paragraph text.

" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + assert "

Title

" in result + assert "Test" in result + + def test_empty_paragraph_no_crash(self, tmp_path): + html = "

Real content here.

" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + assert len(result) > 0 + + def test_html_entities_handled(self, tmp_path): + html = "

Tom & Jerry went to the park.

" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + found_ipa = any(ch in IPA_CHARS for ch in result) + assert found_ipa + + def test_stdout_mode_no_exception(self, tmp_path, capsys): + html = "

Testing stdout output mode.

" + input_file = tmp_path / "input.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), None) + captured = capsys.readouterr() + assert len(captured.out) > 0 + found_ipa = any(ch in IPA_CHARS for ch in captured.out) + assert found_ipa