From 50ef2fad610554e38bf0882ca22a88c4f05a6ac4 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 8 Dec 2025 16:41:41 -0700 Subject: [PATCH 1/5] Move file --- compass/extraction/apply.py | 2 +- compass/{extraction => utilities}/ngrams.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename compass/{extraction => utilities}/ngrams.py (100%) diff --git a/compass/extraction/apply.py b/compass/extraction/apply.py index e43008902..efa9b2db4 100644 --- a/compass/extraction/apply.py +++ b/compass/extraction/apply.py @@ -10,6 +10,7 @@ LegalTextValidator, parse_by_chunks, ) +from compass.utilities.ngrams import sentence_ngram_containment from compass.warn import COMPASSWarning @@ -338,7 +339,6 @@ async def _extract_with_ngram_check( ngram_ocr_fraction_threshold=0.75, ): """Extract ordinance info from doc and validate using ngrams.""" - from compass.extraction.ngrams import sentence_ngram_containment # noqa source = doc.attrs.get("source", "Unknown") doc_is_from_ocr = doc.attrs.get("from_ocr", False) diff --git a/compass/extraction/ngrams.py b/compass/utilities/ngrams.py similarity index 100% rename from compass/extraction/ngrams.py rename to compass/utilities/ngrams.py From dc4e6dfbd6c3ffa8a6010aa6e1e81bf826dbc257 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 8 Dec 2025 16:43:51 -0700 Subject: [PATCH 2/5] Always return float --- compass/utilities/ngrams.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compass/utilities/ngrams.py b/compass/utilities/ngrams.py index 419d1b9b1..5c22be01d 100644 --- a/compass/utilities/ngrams.py +++ b/compass/utilities/ngrams.py @@ -79,12 +79,12 @@ def sentence_ngram_containment(original, test, n): ------- float Fraction of ngrams from the `test` input that were found in the - `original` text. Always returns ``True`` if test has no ngrams. + `original` text. Returns ``0`` if test has no ngrams. """ ngrams_test = convert_text_to_sentence_ngrams(test, n) num_test_ngrams = len(ngrams_test) if not num_test_ngrams: - return True + return 0.0 ngrams_original = set(convert_text_to_sentence_ngrams(original, n)) num_ngrams_found = sum(t in ngrams_original for t in ngrams_test) From af8852c2cdd8f0cccaa0f915a920f0eb05f3eb40 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 8 Dec 2025 17:09:04 -0700 Subject: [PATCH 3/5] Use ngrams to check if phrase is in text --- compass/validation/content.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/compass/validation/content.py b/compass/validation/content.py index 1b084cd9d..4152d5a03 100644 --- a/compass/validation/content.py +++ b/compass/validation/content.py @@ -7,11 +7,14 @@ import asyncio import logging from abc import ABC, abstractmethod +from warnings import warn from compass.llm.calling import ChatLLMCaller, StructuredLLMCaller from compass.validation.graphs import setup_graph_correct_document_type from compass.common import setup_async_decision_tree, run_async_tree from compass.utilities.enums import LLMUsageCategory +from compass.utilities.ngrams import convert_text_to_sentence_ngrams +from compass.warn import COMPASSWarning logger = logging.getLogger(__name__) @@ -177,10 +180,31 @@ def _count_acronym_matches(self, heuristics_text): def _count_phrase_matches(self, heuristics_text): """Count number of good tech phrases that appear in text""" - return sum( - all(keyword in heuristics_text for keyword in phrase.split(" ")) - for phrase in self.GOOD_TECH_PHRASES - ) + text_ngrams = {} + total = 0 + for phrase in self.GOOD_TECH_PHRASES: + n = len(phrase.split(" ")) + if n <= 1: + msg = ( + "Make sure your GOOD_TECH_PHRASES contain at least 2 " + f"words! Got phrase: {phrase!r}" + ) + warn(msg, COMPASSWarning) + continue + + if n not in text_ngrams: + text_ngrams[n] = set( + convert_text_to_sentence_ngrams(heuristics_text, n) + ) + + test_ngrams = ( # fmt: off + convert_text_to_sentence_ngrams(phrase, n) + + convert_text_to_sentence_ngrams(f"{phrase}s", n) + ) + if any(t in text_ngrams[n] for t in test_ngrams): + total += 1 + + return total @property @abstractmethod From 520e46c3e181deeaef6bd87fc5e5947f046b968d Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 8 Dec 2025 17:09:20 -0700 Subject: [PATCH 4/5] Add extra test cases --- tests/python/unit/extraction/test_extraction_validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/unit/extraction/test_extraction_validation.py b/tests/python/unit/extraction/test_extraction_validation.py index c736e3d1b..d482d7012 100644 --- a/tests/python/unit/extraction/test_extraction_validation.py +++ b/tests/python/unit/extraction/test_extraction_validation.py @@ -16,6 +16,7 @@ ("Window SETBACKS", False), ("SWECS SETBACKS", False), ("(wind LWET)", True), + ("(wind\n LWET)", True), ("Wind SWECS", False), ("Wind WES", False), ("Wind WES\n", True), @@ -36,6 +37,7 @@ def test_possibly_mentions_wind(text, truth): ("SOLARIS SETBACKS", False), ("WECS SETBACKS", False), ("(solar farm)", True), + ("(solar\nfarm)", True), ("Solar WECS", False), ("Solar SES", False), ("Solar SES\n", True), From 83b1c7b201f7bdcbb6fe49fe2fe6b3eaa45e7cd5 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 8 Dec 2025 18:30:43 -0700 Subject: [PATCH 5/5] Add tests for ngrams --- .../unit/utilities/test_utilities_ngrams.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 tests/python/unit/utilities/test_utilities_ngrams.py diff --git a/tests/python/unit/utilities/test_utilities_ngrams.py b/tests/python/unit/utilities/test_utilities_ngrams.py new file mode 100644 index 000000000..2876b7cf5 --- /dev/null +++ b/tests/python/unit/utilities/test_utilities_ngrams.py @@ -0,0 +1,64 @@ +"""Test ngram utilities""" + +from pathlib import Path + +import pytest + +from compass.utilities.ngrams import ( + _check_word, + _filtered_words, + sentence_ngram_containment, + convert_text_to_sentence_ngrams, +) + + +def test_check_word_filters_common_terms_and_punctuation(): + """Test `_check_word` rejects stop words and punctuation""" + + assert not _check_word("the") + assert not _check_word(",") + assert _check_word("solar") + + +def test_filtered_words_removes_noise_tokens(): + """Test `_filtered_words` only returns significant tokens""" + + sentence = "The solar arrays, and storage!" + assert _filtered_words(sentence) == [ + "solar", + "arrays", + "storage", + "!", + ] + + +def test_convert_text_to_sentence_ngrams_multiple_sentences(): + """Test `convert_text_to_sentence_ngrams` builds ngrams per sentence""" + + text = "The solar arrays store energy. Solar storage thrives." + assert convert_text_to_sentence_ngrams(text, 2) == [ + ("solar", "arrays"), + ("arrays", "store"), + ("store", "energy"), + ("solar", "storage"), + ("storage", "thrives"), + ] + + +def test_sentence_ngram_containment_computes_fraction(): + """Test `sentence_ngram_containment` returns containment ratio""" + + original = "Solar arrays store energy. Batteries support solar arrays." + test_text = "Solar arrays store energy. Solar arrays fail." + result = sentence_ngram_containment(original, test_text, 2) + assert result == pytest.approx(0.8) + + +def test_sentence_ngram_containment_handles_empty_test_text(): + """Test containment logic handles empty-or-stopword sentences""" + + assert sentence_ngram_containment("", "The and is", 2) == 0.0 + + +if __name__ == "__main__": + pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"])