Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion compass/extraction/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
LegalTextValidator,
parse_by_chunks,
)
from compass.utilities.ngrams import sentence_ngram_containment
from compass.warn import COMPASSWarning


Expand Down Expand Up @@ -338,7 +339,6 @@ async def _extract_with_ngram_check(
ngram_ocr_fraction_threshold=0.75,
):
"""Extract ordinance info from doc and validate using ngrams."""
from compass.extraction.ngrams import sentence_ngram_containment # noqa

source = doc.attrs.get("source", "Unknown")
doc_is_from_ocr = doc.attrs.get("from_ocr", False)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,12 @@ def sentence_ngram_containment(original, test, n):
-------
float
Fraction of ngrams from the `test` input that were found in the
`original` text. Always returns ``True`` if test has no ngrams.
`original` text. Returns ``0`` if test has no ngrams.
"""
ngrams_test = convert_text_to_sentence_ngrams(test, n)
num_test_ngrams = len(ngrams_test)
if not num_test_ngrams:
return True
return 0.0

ngrams_original = set(convert_text_to_sentence_ngrams(original, n))
num_ngrams_found = sum(t in ngrams_original for t in ngrams_test)
Expand Down
32 changes: 28 additions & 4 deletions compass/validation/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
import asyncio
import logging
from abc import ABC, abstractmethod
from warnings import warn

from compass.llm.calling import ChatLLMCaller, StructuredLLMCaller
from compass.validation.graphs import setup_graph_correct_document_type
from compass.common import setup_async_decision_tree, run_async_tree
from compass.utilities.enums import LLMUsageCategory
from compass.utilities.ngrams import convert_text_to_sentence_ngrams
from compass.warn import COMPASSWarning


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -177,10 +180,31 @@ def _count_acronym_matches(self, heuristics_text):

def _count_phrase_matches(self, heuristics_text):
    """Count number of good tech phrases that appear in text

    Each phrase in ``GOOD_TECH_PHRASES`` is matched as a sentence
    ngram (word order and adjacency matter) instead of checking that
    every keyword merely occurs somewhere in the text as a substring.

    Parameters
    ----------
    heuristics_text : str
        Text to search for the technology phrases.

    Returns
    -------
    int
        Number of phrases in ``GOOD_TECH_PHRASES`` for which at least
        one ngram (built from the phrase itself or from the phrase
        with a trailing "s") is found among the ngrams of
        `heuristics_text`.

    Warns
    -----
    COMPASSWarning
        If a phrase consists of fewer than two space-separated words.
        Such phrases are skipped and never counted.
    """
    # Cache of text ngram sets keyed by ngram size, so the input text is
    # only converted once per distinct phrase length
    text_ngrams = {}
    total = 0
    for phrase in self.GOOD_TECH_PHRASES:
        n = len(phrase.split(" "))
        if n <= 1:
            msg = (
                "Make sure your GOOD_TECH_PHRASES contain at least 2 "
                f"words! Got phrase: {phrase!r}"
            )
            warn(msg, COMPASSWarning)
            continue

        if n not in text_ngrams:
            text_ngrams[n] = set(
                convert_text_to_sentence_ngrams(heuristics_text, n)
            )

        # Also try the phrase with an "s" appended so that a simple
        # plural of the last word still counts as a match
        test_ngrams = (  # fmt: off
            convert_text_to_sentence_ngrams(phrase, n)
            + convert_text_to_sentence_ngrams(f"{phrase}s", n)
        )
        if any(t in text_ngrams[n] for t in test_ngrams):
            total += 1

    return total

@property
@abstractmethod
Expand Down
2 changes: 2 additions & 0 deletions tests/python/unit/extraction/test_extraction_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
("Window SETBACKS", False),
("SWECS SETBACKS", False),
("(wind LWET)", True),
("(wind\n LWET)", True),
("Wind SWECS", False),
("Wind WES", False),
("Wind WES\n", True),
Expand All @@ -36,6 +37,7 @@ def test_possibly_mentions_wind(text, truth):
("SOLARIS SETBACKS", False),
("WECS SETBACKS", False),
("(solar farm)", True),
("(solar\nfarm)", True),
("Solar WECS", False),
("Solar SES", False),
("Solar SES\n", True),
Expand Down
64 changes: 64 additions & 0 deletions tests/python/unit/utilities/test_utilities_ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""Test ngram utilities"""

from pathlib import Path

import pytest

from compass.utilities.ngrams import (
_check_word,
_filtered_words,
sentence_ngram_containment,
convert_text_to_sentence_ngrams,
)


def test_check_word_filters_common_terms_and_punctuation():
    """Verify `_check_word` is falsy for "the" and "," but not "solar" """

    # Tokens that must be rejected, checked in a fixed order
    for rejected_token in ("the", ","):
        assert not _check_word(rejected_token)

    # A meaningful term must be kept
    assert _check_word("solar")


def test_filtered_words_removes_noise_tokens():
    """Verify `_filtered_words` keeps only the expected tokens"""

    expected_tokens = ["solar", "arrays", "storage", "!"]
    kept_tokens = _filtered_words("The solar arrays, and storage!")
    assert kept_tokens == expected_tokens


def test_convert_text_to_sentence_ngrams_multiple_sentences():
    """Verify bigrams are produced sentence by sentence"""

    # No expected bigram spans the boundary between the two sentences
    first_sentence_bigrams = [
        ("solar", "arrays"),
        ("arrays", "store"),
        ("store", "energy"),
    ]
    second_sentence_bigrams = [
        ("solar", "storage"),
        ("storage", "thrives"),
    ]

    bigrams = convert_text_to_sentence_ngrams(
        "The solar arrays store energy. Solar storage thrives.", 2
    )
    assert bigrams == first_sentence_bigrams + second_sentence_bigrams


def test_sentence_ngram_containment_computes_fraction():
    """Verify `sentence_ngram_containment` yields the expected ratio"""

    containment = sentence_ngram_containment(
        "Solar arrays store energy. Batteries support solar arrays.",
        "Solar arrays store energy. Solar arrays fail.",
        2,
    )
    # Compare with a tolerance since the result is a float ratio
    assert containment == pytest.approx(0.8)


def test_sentence_ngram_containment_handles_empty_test_text():
    """Verify containment is 0.0 for the "The and is" test input"""

    containment = sentence_ngram_containment("", "The and is", 2)
    assert containment == 0.0


if __name__ == "__main__":
    # Allow running this test module directly as a script via pytest
    pytest.main(
        [
            "-q",
            "--show-capture=all",
            Path(__file__),
            "-rapP",
        ]
    )