NatLabRockies · ppinchuk · Dec 11, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 9, 2025
@@ -10,6 +10,7 @@
     LegalTextValidator,
     parse_by_chunks,
 )
+from compass.utilities.ngrams import sentence_ngram_containment
 from compass.warn import COMPASSWarning
 
 
@@ -338,7 +339,6 @@ async def _extract_with_ngram_check(
     ngram_ocr_fraction_threshold=0.75,
 ):
     """Extract ordinance info from doc and validate using ngrams."""
-    from compass.extraction.ngrams import sentence_ngram_containment  # noqa
 
     source = doc.attrs.get("source", "Unknown")
     doc_is_from_ocr = doc.attrs.get("from_ocr", False)

@@ -79,12 +79,12 @@ def sentence_ngram_containment(original, test, n):
     -------
     float
         Fraction of ngrams from the `test` input that were found in the
-        `original` text. Always returns ``True`` if test has no ngrams.
+        `original` text. Returns ``0.0`` if `test` has no ngrams.
     """
     ngrams_test = convert_text_to_sentence_ngrams(test, n)
     num_test_ngrams = len(ngrams_test)
     if not num_test_ngrams:
-        return True
+        return 0.0
 
     ngrams_original = set(convert_text_to_sentence_ngrams(original, n))
     num_ngrams_found = sum(t in ngrams_original for t in ngrams_test)

@@ -7,11 +7,14 @@
 import asyncio
 import logging
 from abc import ABC, abstractmethod
+from warnings import warn
 
 from compass.llm.calling import ChatLLMCaller, StructuredLLMCaller
 from compass.validation.graphs import setup_graph_correct_document_type
 from compass.common import setup_async_decision_tree, run_async_tree
 from compass.utilities.enums import LLMUsageCategory
+from compass.utilities.ngrams import convert_text_to_sentence_ngrams
+from compass.warn import COMPASSWarning
 
 
 logger = logging.getLogger(__name__)
@@ -177,10 +180,55 @@ def _count_acronym_matches(self, heuristics_text):
 
     def _count_phrase_matches(self, heuristics_text):
-        """Count number of good tech phrases that appear in text"""
-        return sum(
-            all(keyword in heuristics_text for keyword in phrase.split(" "))
-            for phrase in self.GOOD_TECH_PHRASES
-        )
+        """Count number of good tech phrases that appear in text
+
+        A phrase is counted once if it, or the phrase with an "s"
+        appended, occurs as a sentence ngram of the text.
+
+        Parameters
+        ----------
+        heuristics_text : str
+            Text to search for the phrases in `GOOD_TECH_PHRASES`.
+
+        Returns
+        -------
+        int
+            Number of phrases found in the text. Phrases with fewer
+            than two words are skipped with a `COMPASSWarning`.
+        """
+        # Text ngram sets keyed by ngram size, so the text is only
+        # converted once per distinct phrase length
+        text_ngrams = {}
+        total = 0
+        for phrase in self.GOOD_TECH_PHRASES:
+            # Split on any whitespace run: splitting on " " counts the
+            # empty strings around repeated spaces as words, inflating
+            # `n` so that the phrase could never match
+            n = len(phrase.split())
+            if n <= 1:
+                msg = (
+                    "Make sure your GOOD_TECH_PHRASES contain at least 2 "
+                    f"words! Got phrase: {phrase!r}"
+                )
+                warn(msg, COMPASSWarning)
+                continue
+
+            if n not in text_ngrams:
+                text_ngrams[n] = set(
+                    convert_text_to_sentence_ngrams(heuristics_text, n)
+                )
+
+            # NOTE(review): the ngram helper appears to drop stop words,
+            # so a phrase containing one may yield no n-sized ngrams and
+            # never match -- confirm phrases avoid stop words
+            # Also try the naive plural form of the phrase
+            test_ngrams = (  # fmt: off
+                convert_text_to_sentence_ngrams(phrase, n)
+                + convert_text_to_sentence_ngrams(f"{phrase}s", n)
+            )
+            if any(t in text_ngrams[n] for t in test_ngrams):
+                total += 1
+
+        return total
 
     @property
     @abstractmethod

@@ -16,6 +16,7 @@
         ("Window SETBACKS", False),
         ("SWECS SETBACKS", False),
         ("(wind LWET)", True),
+        ("(wind\n LWET)", True),
         ("Wind SWECS", False),
         ("Wind WES", False),
         ("Wind WES\n", True),
@@ -36,6 +37,7 @@ def test_possibly_mentions_wind(text, truth):
         ("SOLARIS SETBACKS", False),
         ("WECS SETBACKS", False),
         ("(solar farm)", True),
+        ("(solar\nfarm)", True),
         ("Solar WECS", False),
         ("Solar SES", False),
         ("Solar SES\n", True),

@@ -0,0 +1,64 @@
+"""Test ngram utilities"""
+
+from pathlib import Path
+
+import pytest
+
+from compass.utilities.ngrams import (
+    _check_word,
+    _filtered_words,
+    sentence_ngram_containment,
+    convert_text_to_sentence_ngrams,
+)
+
+
+def test_check_word_filters_common_terms_and_punctuation():
+    """Stop words and punctuation are rejected by `_check_word`"""
+
+    for noise_token in ("the", ","):
+        assert not _check_word(noise_token)
+
+    assert _check_word("solar")
+
+
+def test_filtered_words_removes_noise_tokens():
+    """Only significant tokens survive `_filtered_words`"""
+
+    expected = ["solar", "arrays", "storage", "!"]
+    assert _filtered_words("The solar arrays, and storage!") == expected
+
+
+def test_convert_text_to_sentence_ngrams_multiple_sentences():
+    """Bigrams are built within each sentence, never across them"""
+
+    first_sentence_bigrams = [
+        ("solar", "arrays"),
+        ("arrays", "store"),
+        ("store", "energy"),
+    ]
+    second_sentence_bigrams = [("solar", "storage"), ("storage", "thrives")]
+    bigrams = convert_text_to_sentence_ngrams(
+        "The solar arrays store energy. Solar storage thrives.", 2
+    )
+    assert bigrams == first_sentence_bigrams + second_sentence_bigrams
+
+
+def test_sentence_ngram_containment_computes_fraction():
+    """Containment is the share of test ngrams found in the original"""
+
+    reference = "Solar arrays store energy. Batteries support solar arrays."
+    candidate = "Solar arrays store energy. Solar arrays fail."
+    containment = sentence_ngram_containment(reference, candidate, 2)
+    assert containment == pytest.approx(0.8)
+
+
+def test_sentence_ngram_containment_handles_empty_test_text():
+    """A test text that yields no ngrams has a containment of zero"""
+
+    no_ngram_text = "The and is"
+    assert sentence_ngram_containment("", no_ngram_text, 2) == 0.0
+
+
+if __name__ == "__main__":
+    pytest_args = ["-q", "--show-capture=all", Path(__file__), "-rapP"]
+    pytest.main(pytest_args)