Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Changes:

Fixes:
- Modifies rendering of AhocorasickTokenizer parameter in API docs II
- Fixes loss of case name text when a nominative reporter token is discarded due to overlapping CitationTokens

## Current

Expand Down
20 changes: 19 additions & 1 deletion eyecite/tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,11 +362,29 @@ def tokenize(self, text: str) -> tuple[Tokens, list[tuple[int, Token]]]:
and isinstance(token, CitationToken)
and token_is_from_nominative_reporter(last_token)
):
# if a token has overlapping matches between a nominative
# If a token has overlapping matches between a nominative
# reporter and another type of case citation, prefer the
# other case citation. See #221 and #174
#
# Example: "Calderon v. Thompson, 523 U.S. 538"
# - nominative token: "Thompson, 523" (positions 12-25)
# - standard token: "523 U.S. 538" (positions 22-34)
# These overlap at "523", so we discard the nominative token.
citation_tokens.pop(-1)
all_tokens.pop(-1)

# However, the text "Thompson, " was only contained in the
# nominative token. If we discard the token, then the text no longer
# appears in any token. So we have to make sure it's preserved.
non_overlapping_start = last_token.start
non_overlapping_end = min(last_token.end, token.start)

if non_overlapping_start < non_overlapping_end:
discarded_text = text[
non_overlapping_start:non_overlapping_end
]
self.append_text(all_tokens, discarded_text)
offset = non_overlapping_end
else:
# skip overlaps
continue
Expand Down
54 changes: 54 additions & 0 deletions tests/test_FindTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1962,3 +1962,57 @@ def test_citation_in_parenthetical_does_not_emit_warning(self, mock_warn):
citations = get_citations(text)
self.assertEqual(len(citations), 2)
mock_warn.assert_not_called()

def test_nominative_reporter_case_name_extraction(self):
    """Check party names when one of them is also a nominative reporter.

    A surname such as "Thompson" is also the name of a nominative
    reporter, which can lead to overlapping citation tokens during
    tokenization. For every sample below this verifies that exactly one
    citation is found, that its plaintiff and defendant match the
    expected values, and that both names lie inside the text covered by
    the citation's full span.
    """
    # "Thompson" is a known nominative reporter; this input used to fail.
    calderon_expected = case_citation(
        volume="523",
        reporter="U.S.",
        page="538",
        year=1998,
        metadata={
            "plaintiff": "Calderon",
            "defendant": "Thompson",
            "pin_cite": "556",
            "court": "scotus",
        },
    )
    samples = [
        ("Calderon v. Thompson, 523 U.S. 538, 556 (1998)", calderon_expected),
    ]

    for cite_string, expected in samples:
        with self.subTest(cite_string=cite_string):
            results = get_citations(cite_string)
            self.assertEqual(len(results), 1)
            hit = results[0]

            # Extracted party names must match the expected metadata.
            self.assertEqual(
                hit.metadata.plaintiff, expected.metadata.plaintiff
            )
            self.assertEqual(
                hit.metadata.defendant, expected.metadata.defendant
            )

            # The full span must cover both party names in the source text.
            span = hit.full_span()
            covered = cite_string[span[0] : span[1]]
            self.assertIn(
                expected.metadata.plaintiff,
                covered,
                f"Full span should include plaintiff '{expected.metadata.plaintiff}' for: {cite_string}",
            )
            self.assertIn(
                expected.metadata.defendant,
                covered,
                f"Full span should include defendant '{expected.metadata.defendant}' for: {cite_string}",
            )
Loading