freelawproject · branliu0 · Jul 7, 2025 · Jul 7, 2025
diff --git a/CHANGES.md b/CHANGES.md
@@ -12,6 +12,7 @@ Changes:
 
 Fixes:
 - Modifies rendering of AhocorasickTokenizer parameter in API docs II
+- Fixes dropping of case name text covered by a nominative reporter token that is discarded due to overlapping CitationTokens
 
 ## Current
 

diff --git a/eyecite/tokenizers.py b/eyecite/tokenizers.py
@@ -362,11 +362,29 @@ def tokenize(self, text: str) -> tuple[Tokens, list[tuple[int, Token]]]:
                     and isinstance(token, CitationToken)
                     and token_is_from_nominative_reporter(last_token)
                 ):
-                    # if a token has overlapping matches between a nominative
+                    # If a token has overlapping matches between a nominative
                     # reporter and another type of case citation, prefer the
                     # other case citation. See #221 and #174
+                    #
+                    # Example: "Calderon v. Thompson, 523 U.S. 538"
+                    # - nominative token: "Thompson, 523" (positions 12-25)
+                    # - standard token: "523 U.S. 538" (positions 22-34)
+                    # These overlap at "523", so we discard the nominative token.
                     citation_tokens.pop(-1)
                     all_tokens.pop(-1)
+
+                    # However, the text "Thompson, " was only contained in the
+                    # nominative token, so discarding that token would drop it from
+                    # the token stream. Re-append the non-overlapping prefix as text.
+                    non_overlapping_start = last_token.start
+                    non_overlapping_end = min(last_token.end, token.start)
+
+                    if non_overlapping_start < non_overlapping_end:
+                        discarded_text = text[
+                            non_overlapping_start:non_overlapping_end
+                        ]
+                        self.append_text(all_tokens, discarded_text)
+                        offset = non_overlapping_end
                 else:
                     # skip overlaps
                     continue

diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
@@ -1962,3 +1962,57 @@ def test_citation_in_parenthetical_does_not_emit_warning(self, mock_warn):
         citations = get_citations(text)
         self.assertEqual(len(citations), 2)
         mock_warn.assert_not_called()
+
+    def test_nominative_reporter_case_name_extraction(self):
+        """Test case name extraction when a party is a nominative reporter.
+
+        A party name that is also a nominative reporter (like Thompson) can
+        produce a nominative CitationToken that overlaps the real citation.
+        Discarding that overlapping token must not drop the party name from
+        the extracted plaintiff/defendant or from the citation's full span.
+        """
+        test_pairs = [
+            # Thompson is a known nominative reporter - this was failing
+            (
+                "Calderon v. Thompson, 523 U.S. 538, 556 (1998)",
+                case_citation(
+                    volume="523",
+                    reporter="U.S.",
+                    page="538",
+                    year=1998,
+                    metadata={
+                        "plaintiff": "Calderon",
+                        "defendant": "Thompson",
+                        "pin_cite": "556",
+                        "court": "scotus",
+                    },
+                ),
+            ),
+        ]
+
+        for cite_string, expected in test_pairs:
+            with self.subTest(cite_string=cite_string):
+                found_cites = get_citations(cite_string)
+                self.assertEqual(len(found_cites), 1)
+                found = found_cites[0]
+
+                self.assertEqual(
+                    found.metadata.plaintiff, expected.metadata.plaintiff
+                )
+                self.assertEqual(
+                    found.metadata.defendant, expected.metadata.defendant
+                )
+
+                full_span_text = cite_string[
+                    found.full_span()[0] : found.full_span()[1]
+                ]
+                self.assertIn(
+                    expected.metadata.plaintiff,
+                    full_span_text,
+                    f"Full span should include plaintiff '{expected.metadata.plaintiff}' for: {cite_string}",
+                )
+                self.assertIn(
+                    expected.metadata.defendant,
+                    full_span_text,
+                    f"Full span should include defendant '{expected.metadata.defendant}' for: {cite_string}",
+                )
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,6 +12,7 @@ Changes: @@
     Fixes:
     - Modifies rendering of AhocorasickTokenizer parameter in API docs II
+    - Fixes dropping of case name text covered by a nominative reporter token that is discarded due to overlapping CitationTokens
     ## Current
@@ Expand Down @@