From c94456dc47165d434e18d69de276719f78be9edc Mon Sep 17 00:00:00 2001 From: Brandon Liu Date: Mon, 7 Jul 2025 13:01:23 -0400 Subject: [PATCH 1/2] fix(tokenizer): Don't drop nominative reporter when overlapping --- eyecite/tokenizers.py | 20 +++++++++++++++- tests/test_FindTest.py | 54 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/eyecite/tokenizers.py b/eyecite/tokenizers.py index 30cbf6f..caebb1e 100644 --- a/eyecite/tokenizers.py +++ b/eyecite/tokenizers.py @@ -362,11 +362,29 @@ def tokenize(self, text: str) -> tuple[Tokens, list[tuple[int, Token]]]: and isinstance(token, CitationToken) and token_is_from_nominative_reporter(last_token) ): - # if a token has overlapping matches between a nominative + # If a token has overlapping matches between a nominative # reporter and another type of case citation, prefer the # other case citation. See #221 and #174 + # + # Example: "Calderon v. Thompson, 523 U.S. 538" + # - nominative token: "Thompson, 523" (positions 12-25) + # - standard token: "523 U.S. 538" (positions 22-34) + # These overlap at "523", so we discard the nominative token. citation_tokens.pop(-1) all_tokens.pop(-1) + + # However, the text "Thompson, " was only contained in the + # nominative token. If we discard the token, then the text no longer + # appears in any token. So we have to make sure it's preserved. 
+ non_overlapping_start = last_token.start + non_overlapping_end = min(last_token.end, token.start) + + if non_overlapping_start < non_overlapping_end: + discarded_text = text[ + non_overlapping_start:non_overlapping_end + ] + self.append_text(all_tokens, discarded_text) + offset = non_overlapping_end else: # skip overlaps continue diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index 13bfe01..ff04fb8 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -1962,3 +1962,57 @@ def test_citation_in_parenthetical_does_not_emit_warning(self, mock_warn): citations = get_citations(text) self.assertEqual(len(citations), 2) mock_warn.assert_not_called() + + def test_nominative_reporter_case_name_extraction(self): + """Test that case names are correctly extracted when nominative reporter + names appear in case names. + + This tests the issue where nominative reporter names (like Thompson) in + case names can interfere with tokenization and case name extraction due + to overlapping citation token conflicts. + """ + test_pairs = [ + # Thompson is a known nominative reporter - this was failing + ( + "Calderon v. Thompson, 523 U.S. 
538, 556 (1998)", + case_citation( + volume="523", + reporter="U.S.", + page="538", + year=1998, + metadata={ + "plaintiff": "Calderon", + "defendant": "Thompson", + "pin_cite": "556", + "court": "scotus", + }, + ), + ), + ] + + for cite_string, expected in test_pairs: + with self.subTest(cite_string=cite_string): + found_cites = get_citations(cite_string) + self.assertEqual(len(found_cites), 1) + found = found_cites[0] + + self.assertEqual( + found.metadata.plaintiff, expected.metadata.plaintiff + ) + self.assertEqual( + found.metadata.defendant, expected.metadata.defendant + ) + + full_span_text = cite_string[ + found.full_span()[0] : found.full_span()[1] + ] + self.assertIn( + expected.metadata.plaintiff, + full_span_text, + f"Full span should include plaintiff '{expected.metadata.plaintiff}' for: {cite_string}", + ) + self.assertIn( + expected.metadata.defendant, + full_span_text, + f"Full span should include defendant '{expected.metadata.defendant}' for: {cite_string}", + ) From 5d79fe34e2b9fca9ef704622d9f911eca9e16774 Mon Sep 17 00:00:00 2001 From: Brandon Liu Date: Mon, 7 Jul 2025 13:07:46 -0400 Subject: [PATCH 2/2] Add to CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 8e4ab05..8df1b69 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -12,6 +12,7 @@ Changes: Fixes: - Modifies rendering of AhocorasickTokenizer parameter in API docs II +- Fixes loss of case name text when a nominative reporter token is discarded due to overlapping CitationTokens ## Current