diff --git a/test_main.py b/test_main.py index 40aea02..3f66805 100644 --- a/test_main.py +++ b/test_main.py @@ -1,5 +1,7 @@ import json import os +import re +import subprocess import tempfile import pytest @@ -20,6 +22,7 @@ remove_checkpoint, _decode_html_text, _decode_text_nodes, + process_html_file, is_verb_in_sentence, ipa_vowels, ipa_consonants, @@ -322,3 +325,129 @@ def test_h_reduction_non_empty(self): def test_double_word_reductions_non_empty(self): assert len(double_word_reductions) > 0 + + +def _flite_available(): + flite_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'flite', 'bin', 'flite') + if not os.path.isfile(flite_path): + return False + try: + subprocess.check_output([flite_path, "-t", "test", "-i"]) + return True + except (OSError, subprocess.CalledProcessError): + return False + +requires_flite = pytest.mark.skipif(not _flite_available(), reason="flite binary not available") + +IPA_CHARS = set(ipa_vowels + ipa_consonants + "ɾˈˌ") + + +def _extract_text_from_html(html_content): + return re.sub(r'<[^>]*>', '', html_content) + + +def _word_overlap_ratio(text_a, text_b): + words_a = set(re.findall(r'[a-zA-Z]+', text_a.lower())) + words_b = set(re.findall(r'[a-zA-Z]+', text_b.lower())) + if not words_a: + return 0.0 + return len(words_a & words_b) / len(words_a) + + +@requires_flite +class TestProcessHtmlFileIntegration: + + def test_simple_html_no_exception(self, tmp_path): + html = "
The cat sat on the mat.
" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + assert output_file.exists() + result = output_file.read_text(encoding="utf-8") + assert len(result) > 0 + + def test_output_contains_ipa(self, tmp_path): + html = "The quick brown fox jumps over the lazy dog.
" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + found_ipa = any(ch in IPA_CHARS for ch in result) + assert found_ipa, "Output should contain IPA characters" + + def test_output_preserves_original_text(self, tmp_path): + html = "Hello world, this is a simple test.
" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + plain_output = _extract_text_from_html(result) + assert _word_overlap_ratio("Hello world this is a simple test", plain_output) > 0.5 + + def test_html_tags_preserved(self, tmp_path): + html = "Some bold text here.
" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + assert "" in result + assert "" in result + + def test_multiple_paragraphs(self, tmp_path): + html = ( + "" + "First paragraph with some words.
" + "Second paragraph has different words.
" + "Third paragraph is also here.
" + "" + ) + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + p_tags = re.findall(r']*>', result) + assert len(p_tags) >= 6, "Each input paragraph should produce an IPA paragraph and an original paragraph" + + def test_non_paragraph_content_preserved(self, tmp_path): + html = "
Paragraph text.
" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + assert "Real content here.
" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + assert len(result) > 0 + + def test_html_entities_handled(self, tmp_path): + html = "Tom & Jerry went to the park.
" + input_file = tmp_path / "input.html" + output_file = tmp_path / "output.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), str(output_file)) + result = output_file.read_text(encoding="utf-8") + found_ipa = any(ch in IPA_CHARS for ch in result) + assert found_ipa + + def test_stdout_mode_no_exception(self, tmp_path, capsys): + html = "Testing stdout output mode.
" + input_file = tmp_path / "input.html" + input_file.write_text(html, encoding="utf-8") + process_html_file(str(input_file), None) + captured = capsys.readouterr() + assert len(captured.out) > 0 + found_ipa = any(ch in IPA_CHARS for ch in captured.out) + assert found_ipa