From 50ef2fad610554e38bf0882ca22a88c4f05a6ac4 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 8 Dec 2025 16:41:41 -0700
Subject: [PATCH 1/5] Move file

---
 compass/extraction/apply.py                 | 2 +-
 compass/{extraction => utilities}/ngrams.py | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename compass/{extraction => utilities}/ngrams.py (100%)

diff --git a/compass/extraction/apply.py b/compass/extraction/apply.py
index e43008902..efa9b2db4 100644
--- a/compass/extraction/apply.py
+++ b/compass/extraction/apply.py
@@ -10,6 +10,7 @@
     LegalTextValidator,
     parse_by_chunks,
 )
+from compass.utilities.ngrams import sentence_ngram_containment
 from compass.warn import COMPASSWarning
 
 
@@ -338,7 +339,6 @@ async def _extract_with_ngram_check(
     ngram_ocr_fraction_threshold=0.75,
 ):
     """Extract ordinance info from doc and validate using ngrams."""
-    from compass.extraction.ngrams import sentence_ngram_containment  # noqa
 
     source = doc.attrs.get("source", "Unknown")
     doc_is_from_ocr = doc.attrs.get("from_ocr", False)
diff --git a/compass/extraction/ngrams.py b/compass/utilities/ngrams.py
similarity index 100%
rename from compass/extraction/ngrams.py
rename to compass/utilities/ngrams.py

From dc4e6dfbd6c3ffa8a6010aa6e1e81bf826dbc257 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 8 Dec 2025 16:43:51 -0700
Subject: [PATCH 2/5] Always return float

---
 compass/utilities/ngrams.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compass/utilities/ngrams.py b/compass/utilities/ngrams.py
index 419d1b9b1..5c22be01d 100644
--- a/compass/utilities/ngrams.py
+++ b/compass/utilities/ngrams.py
@@ -79,12 +79,12 @@ def sentence_ngram_containment(original, test, n):
     -------
     float
         Fraction of ngrams from the `test` input that were found in the
-        `original` text. Always returns ``True`` if test has no ngrams.
+        `original` text. Returns ``0.0`` if `test` has no ngrams.
     """
     ngrams_test = convert_text_to_sentence_ngrams(test, n)
     num_test_ngrams = len(ngrams_test)
     if not num_test_ngrams:
-        return True
+        return 0.0
 
     ngrams_original = set(convert_text_to_sentence_ngrams(original, n))
     num_ngrams_found = sum(t in ngrams_original for t in ngrams_test)

From af8852c2cdd8f0cccaa0f915a920f0eb05f3eb40 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 8 Dec 2025 17:09:04 -0700
Subject: [PATCH 3/5] Use ngrams to check if phrase is in text

---
 compass/validation/content.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/compass/validation/content.py b/compass/validation/content.py
index 1b084cd9d..4152d5a03 100644
--- a/compass/validation/content.py
+++ b/compass/validation/content.py
@@ -7,11 +7,14 @@
 import asyncio
 import logging
 from abc import ABC, abstractmethod
+from warnings import warn
 
 from compass.llm.calling import ChatLLMCaller, StructuredLLMCaller
 from compass.validation.graphs import setup_graph_correct_document_type
 from compass.common import setup_async_decision_tree, run_async_tree
 from compass.utilities.enums import LLMUsageCategory
+from compass.utilities.ngrams import convert_text_to_sentence_ngrams
+from compass.warn import COMPASSWarning
 
 
 logger = logging.getLogger(__name__)
@@ -177,10 +180,31 @@ def _count_acronym_matches(self, heuristics_text):
 
     def _count_phrase_matches(self, heuristics_text):
         """Count number of good tech phrases that appear in text"""
-        return sum(
-            all(keyword in heuristics_text for keyword in phrase.split(" "))
-            for phrase in self.GOOD_TECH_PHRASES
-        )
+        text_ngrams = {}  # ngram size -> set of ngrams built from the text
+        total = 0
+        for phrase in self.GOOD_TECH_PHRASES:
+            n = len(phrase.split())  # tolerate repeated/edge whitespace
+            if n <= 1:
+                msg = (
+                    "Make sure your GOOD_TECH_PHRASES contain at least 2 "
+                    f"words! Got phrase: {phrase!r}"
+                )
+                warn(msg, COMPASSWarning)
+                continue
+
+            if n not in text_ngrams:  # build the text ngrams once per size
+                text_ngrams[n] = set(
+                    convert_text_to_sentence_ngrams(heuristics_text, n)
+                )
+            # Match the phrase as written or with a trailing "s" appended
+            test_ngrams = (  # fmt: off
+                convert_text_to_sentence_ngrams(phrase, n)
+                + convert_text_to_sentence_ngrams(f"{phrase}s", n)
+            )
+            if any(t in text_ngrams[n] for t in test_ngrams):
+                total += 1
+
+        return total
 
     @property
     @abstractmethod

From 520e46c3e181deeaef6bd87fc5e5947f046b968d Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 8 Dec 2025 17:09:20 -0700
Subject: [PATCH 4/5] Add extra test cases

---
 tests/python/unit/extraction/test_extraction_validation.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/python/unit/extraction/test_extraction_validation.py b/tests/python/unit/extraction/test_extraction_validation.py
index c736e3d1b..d482d7012 100644
--- a/tests/python/unit/extraction/test_extraction_validation.py
+++ b/tests/python/unit/extraction/test_extraction_validation.py
@@ -16,6 +16,7 @@
         ("Window SETBACKS", False),
         ("SWECS SETBACKS", False),
         ("(wind LWET)", True),
+        ("(wind\n LWET)", True),
         ("Wind SWECS", False),
         ("Wind WES", False),
         ("Wind WES\n", True),
@@ -36,6 +37,7 @@ def test_possibly_mentions_wind(text, truth):
         ("SOLARIS SETBACKS", False),
         ("WECS SETBACKS", False),
         ("(solar farm)", True),
+        ("(solar\nfarm)", True),
         ("Solar WECS", False),
         ("Solar SES", False),
         ("Solar SES\n", True),

From 83b1c7b201f7bdcbb6fe49fe2fe6b3eaa45e7cd5 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 8 Dec 2025 18:30:43 -0700
Subject: [PATCH 5/5] Add tests for ngrams

---
 .../unit/utilities/test_utilities_ngrams.py   | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 tests/python/unit/utilities/test_utilities_ngrams.py

diff --git a/tests/python/unit/utilities/test_utilities_ngrams.py b/tests/python/unit/utilities/test_utilities_ngrams.py
new file mode 100644
index 000000000..2876b7cf5
--- /dev/null
+++ b/tests/python/unit/utilities/test_utilities_ngrams.py
@@ -0,0 +1,64 @@
+"""Test ngram utilities"""
+
+from pathlib import Path
+
+import pytest
+
+from compass.utilities.ngrams import (
+    _check_word,
+    _filtered_words,
+    sentence_ngram_containment,
+    convert_text_to_sentence_ngrams,
+)
+
+
+def test_check_word_filters_common_terms_and_punctuation():
+    """Stop words and punctuation fail `_check_word`; "solar" passes"""
+
+    rejected = [_check_word(token) for token in ("the", ",")]
+    assert not any(rejected)
+    assert _check_word("solar")
+
+
+def test_filtered_words_removes_noise_tokens():
+    """Test `_filtered_words` drops stop words and commas from a sentence
+
+    Per the expected value below, the trailing "!" token is retained.
+    """
+
+    expected = ["solar", "arrays", "storage", "!"]
+    tokens = _filtered_words("The solar arrays, and storage!")
+
+    assert tokens == expected
+
+
+def test_convert_text_to_sentence_ngrams_multiple_sentences():
+    """Bigrams must not span the boundary between the two sentences"""
+
+    first = [("solar", "arrays"), ("arrays", "store"), ("store", "energy")]
+    second = [("solar", "storage"), ("storage", "thrives")]
+
+    bigrams = convert_text_to_sentence_ngrams(
+        "The solar arrays store energy. Solar storage thrives.", 2
+    )
+
+    assert bigrams == first + second
+
+
+def test_sentence_ngram_containment_computes_fraction():
+    """Containment of the candidate in the reference should be 4/5"""
+
+    reference = "Solar arrays store energy. Batteries support solar arrays."
+    candidate = "Solar arrays store energy. Solar arrays fail."
+    expected = pytest.approx(0.8)
+    assert sentence_ngram_containment(reference, candidate, 2) == expected
+
+
+def test_sentence_ngram_containment_handles_empty_test_text():
+    """Test text that yields no ngrams gives a containment of 0.0"""
+    no_ngram_text = "The and is"
+    assert sentence_ngram_containment("", no_ngram_text, 2) == 0.0
+
+
+if __name__ == "__main__":
+    pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"])