diff --git a/main.py b/main.py index 52ca3c1..c91677f 100644 --- a/main.py +++ b/main.py @@ -462,14 +462,6 @@ def flush_batch(): re.DOTALL | re.IGNORECASE ) -def _get_skip_ranges(content: str) -> List[Tuple[int, int]]: - return [(m.start(), m.end()) for m in SKIP_TAG_PATTERN.finditer(content)] - -def _in_skip_range(pos: int, skip_ranges: List[Tuple[int, int]]) -> bool: - for start, end in skip_ranges: - if start <= pos < end: - return True - return False def _decode_html_text(text: str) -> str: decoded = html_module.unescape(text) @@ -539,8 +531,8 @@ def process_html_file(input_path: str, output_path: Optional[str], resume: bool with open(input_path, 'r', encoding='utf-8') as f: content = f.read() - skip_ranges = _get_skip_ranges(content) - matches = [m for m in PARAGRAPH_PATTERN.finditer(content) if not _in_skip_range(m.start(), skip_ranges)] + content = SKIP_TAG_PATTERN.sub('', content) + matches = list(PARAGRAPH_PATTERN.finditer(content)) paragraph_count = len(matches) checkpoint_path = get_checkpoint_path(output_path) if output_path else None diff --git a/test_main.py b/test_main.py index 3f66805..869c9d4 100644 --- a/test_main.py +++ b/test_main.py @@ -421,7 +421,8 @@ def test_non_paragraph_content_preserved(self, tmp_path): process_html_file(str(input_file), str(output_file)) result = output_file.read_text(encoding="utf-8") assert "

Title

" in result - assert "Test" in result + assert "" not in result + assert "" not in result def test_empty_paragraph_no_crash(self, tmp_path): html = "<html><body><p></p><p>Real content here.</p></body></html>"