diff --git a/README.rst b/README.rst index b0194cff..ddce5574 100644 --- a/README.rst +++ b/README.rst @@ -66,7 +66,7 @@ Install eyecite:: Here's a short example of extracting citations and their metadata from text using eyecite's main :code:`get_citations()` function:: - from eyecite import get_citations + from eyecite import get_citations, Document text = """ Mass. Gen. Laws ch. 1, § 2 (West 1999) (barring ...). @@ -75,7 +75,8 @@ Here's a short example of extracting citations and their metadata from text usin Foo, supra, at 5. """ - get_citations(text) + document = Document(text) + get_citations(document) # returns: [ @@ -113,21 +114,41 @@ eyecite's full API is documented `here str, default :code:`''`: The text to parse. If the - text has markup, it's better to use the :code:`markup_text` argument to get - enhanced extraction. One of `plain_text` or `markup_text` must be passed - as input. -2. :code:`remove_ambiguous` ==> bool, default :code:`False`: Whether to remove citations - that might refer to more than one reporter and can't be narrowed down by date. -3. :code:`tokenizer` ==> Tokenizer, default :code:`eyecite.tokenizers.default_tokenizer`: - An instance of a Tokenizer object (see "Tokenizers" below). -4. :code:`markup_text` ==> str, default :code:`''`: optional XML or HTML source - text that will be used to extract ReferenceCitations or help identify case - names using markup tags. -5. :code:`clean_steps` ==> list, default :code:`None`: list of callables or the - name string of functions in `clean.py`. Used to clean the input text +1. :code:`document` ==> Document: The document to parse. +2. :code:`remove_ambiguous` ==> bool, default :code:`False`: Whether to remove citations that might refer to more than one reporter and can't be narrowed down by date. +3. :code:`tokenizer` ==> Tokenizer, default :code:`eyecite.tokenizers.default_tokenizer`: An instance of a Tokenizer object (see "Tokenizers" below). 
+ + +Creating a Document +------------------- + +eyecite operates on :code:`Document` objects, an abstraction representing the +text of a legal document (or any string!) and any pre-processing cleaning steps +applied to it before citation parsing. Creating a :code:`Document` requires +only a string and three optional parameters: + +1. :code:`source_text` ==> str: The text to be parsed. +2. :code:`has_markup` ==> bool, default :code:`False`: Whether the text contains markup. If it does, it is best to indicate this for a downstream performance boost that exploits the XML / HTML structure. +3. :code:`clean_steps` ==> list, default :code:`[]`: Optional cleaning steps to apply to the source text (see below for more details). +4. :code:`use_dmp` ==> bool, default :code:`True`: Whether differences between the given source text and the cleaned text should be calculated using the fast_diff_match_patch_python library (the default) or the slower built-in difflib library, which may be useful for debugging. + + +Cleaning Document Text +^^^^^^^^^^^^^^^^^^^^^^ + +Oftentimes, the text you want to parse contains unhelpful whitespace, +linebreaks, markup, or other characters that could obstruct the +identification of citations. To deal with these problems, you can specify a +sequence of cleaning steps to apply to your text when you create a :code:`Document`. +The :code:`clean_steps` parameter currently accepts these values as cleaners: + +1. :code:`inline_whitespace`: replace all runs of tab and space characters with a single space character +2. :code:`all_whitespace`: replace all runs of any whitespace character with a single space character +3. :code:`underscores`: remove two or more underscores, a common error in text extracted from PDFs +4. :code:`html`: remove non-visible HTML content using the lxml library +5. Custom function: any function taking a string and returning a string. 
Resolving Reference Citations @@ -144,7 +165,7 @@ citation extraction, followed by a secondary reference resolution step. If you have an external database (e.g., CourtListener) that provides resolved case names, you can use this feature to enhance citation finding.:: - from eyecite import get_citations + from eyecite import get_citations, Document from eyecite.find import extract_reference_citations from eyecite.helpers import filter_citations @@ -156,19 +177,15 @@ case names, you can use this feature to enhance citation finding.:: "self-interested conduct as an As Theatre Enterprises at 552 held, parallel" ) - - from eyecite import get_citations - from eyecite.find import extract_reference_citations - from eyecite.helpers import filter_citations - # Step 1: Extract full citations - citations = get_citations(plain_text) + document = Document(plain_text) + citations = get_citations(document) # Step 2: Resolve the case name from an external database or prior knowledge citations[0].metadata.resolved_case_name_short = "Theatre Enterprises" # Step 3: Extract reference citations using the resolved name - references = extract_reference_citations(citations[0], plain_text) + references = extract_reference_citations(citations[0], document) # Step 4: Filter and merge citations new_citations = filter_citations(citations + references) @@ -177,82 +194,23 @@ Keep in mind that this feature requires an external database or heuristic method to resolve the short case name before extracting reference citations a second time. -Cleaning Input Text -------------------- - -For a given citation text such as "... 1 Baldwin's Rep. 1 ...", you can input -the cleaned text and pass it in the :code:`plain_text` argument without -:code:`clean_steps``, or you can pass it without pre processing and pass a list -to :code:`clean_steps` - -* Spaces will be single space characters, not multiple spaces or other whitespace. -* Quotes and hyphens will be standard quote and hyphen characters. 
-* No junk such as HTML tags inside the citation. - -The cleanup is done via :code:`clean_text`: - -:: - - from eyecite import clean_text, get_citations - - source_text = '

foo 1 U.S. 1

' - plain_text = clean_text(text, ['html', 'inline_whitespace', my_func]) - found_citations = get_citations(plain_text) - -See the `Annotating Citations <#annotating-citations>`_ section for how to insert links into the original text using -citations extracted from the cleaned text. - -:code:`clean_text` currently accepts these values as cleaners: - -1. :code:`inline_whitespace`: replace all runs of tab and space characters with a single space character -2. :code:`all_whitespace`: replace all runs of any whitespace character with a single space character -3. :code:`underscores`: remove two or more underscores, a common error in text extracted from PDFs -4. :code:`html`: remove non-visible HTML content using the lxml library -5. Custom function: any function taking a string and returning a string. - - Annotating Citations -------------------- -For simple plain text, you can insert links to citations using the :code:`annotate_citations` function: +Once the citations have been extracted, you can insert links (or other annotations) to +them in your original text using the :code:`annotate_citations` function: :: - from eyecite import get_citations, annotate_citations + from eyecite import get_citations, annotate_citations, Document - plain_text = 'bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)' - citations = get_citations(plain_text) - linked_text = annotate_citations(plain_text, [[c.span(), "", ""] for c in citations]) + document = Document('bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)') + citations = get_citations(document) + linked_text = annotate_citations(document, [[c.span(), "", ""] for c in citations]) returns: 'bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)' -Each citation returned by get_citations keeps track of where it was found in the source text. -As a result, :code:`annotate_citations` must be called with the *same* cleaned text used by :code:`get_citations` -to extract citations. 
If you do not, the offsets returned by the citation's :code:`span` method will -not align with the text, and your annotations will be in the wrong place. - -If you want to clean text and then insert annotations into the original text, you can pass -the original text in as :code:`source_text`: - -:: - - from eyecite import get_citations, annotate_citations, clean_text - - source_text = '

bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)

' - plain_text = clean_text(source_text, ['html', 'inline_whitespace']) - citations = get_citations(plain_text) - linked_text = annotate_citations(plain_text, [[c.span(), "", ""] for c in citations], source_text=source_text) - - returns: - '

bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)

' - -The above example extracts citations from :code:`plain_text` and applies them to -:code:`source_text`, using a diffing algorithm to insert annotations in the correct locations -in the original text. - -There is also a :code:`full_span` attribute that can be used to get the indexes of the full citation, including the -pre- and post-citation attributes. Wrapping HTML Tags ^^^^^^^^^^^^^^^^^^ @@ -279,7 +237,7 @@ that takes :code:`(before, span_text, after)` and returns the annotated text: def annotator(before, span_text, after): return before + span_text.lower() + after - linked_text = annotate_citations(plain_text, [[c.span(), "", ""] for c in citations], annotator=annotator) + linked_text = annotate_citations(document, [[c.span(), "", ""] for c in citations], annotator=annotator) returns: 'bob lissner v. test 1 u.s. 12, 347-348 (4th Cir. 1982)' @@ -294,10 +252,10 @@ returning a dictionary that maps resources to lists of associated citations: :: - from eyecite import get_citations, resolve_citations + from eyecite import get_citations, resolve_citations, Document - text = 'first citation: 1 U.S. 12. second citation: 2 F.3d 2. third citation: Id.' - found_citations = get_citations(text) + document = Document('first citation: 1 U.S. 12. second citation: 2 F.3d 2. 
third citation: Id.') + found_citations = get_citations(document) resolved_citations = resolve_citations(found_citations) returns (pseudo): diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index f002b9cb..eccefd6f 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -12,7 +12,7 @@ from matplotlib import pyplot as plt # type: ignore -from eyecite import get_citations +from eyecite import Document, get_citations SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.dirname(SCRIPT_DIR)) @@ -59,11 +59,13 @@ def generate_branch_report(self, branch: str) -> None: if text: # Remove XML encodings from xml_harvard text = re.sub(r"^<\?xml.*?\?>", "", text, count=1) - params["markup_text"] = text or "" + params["source_text"] = text or "" + params["has_markup"] = True else: - params["markup_text"] = row["plain_text"] + params["source_text"] = row["plain_text"] - found_citations = get_citations(**params) + document = Document(**params) + found_citations = get_citations(document) # Get the citation text string from the cite object cites = [cite.token.data for cite in found_citations if cite.token] diff --git a/eyecite/__init__.py b/eyecite/__init__.py index fe93ab77..f4c3125a 100644 --- a/eyecite/__init__.py +++ b/eyecite/__init__.py @@ -1,6 +1,7 @@ from .annotate import annotate_citations from .clean import clean_text from .find import get_citations +from .models import Document from .resolve import resolve_citations __all__ = [ @@ -8,6 +9,7 @@ "get_citations", "clean_text", "resolve_citations", + "Document", ] # No need to create API documentation for these internal helper functions diff --git a/eyecite/annotate.py b/eyecite/annotate.py index 0f5f86dc..085b46c9 100644 --- a/eyecite/annotate.py +++ b/eyecite/annotate.py @@ -3,17 +3,19 @@ from difflib import SequenceMatcher from functools import partial from logging import getLogger -from typing import Any +from typing import TYPE_CHECKING, Any import fast_diff_match_patch 
from eyecite.utils import ( is_balanced_html, maybe_balance_style_tags, - placeholder_markup, wrap_html_tags, ) +if TYPE_CHECKING: + from eyecite.models import Document + logger = getLogger(__name__) @@ -122,78 +124,62 @@ def update(self, offset, bisect): def annotate_citations( - plain_text: str, + document: "Document", annotations: Iterable[tuple[tuple[int, int], Any, Any]], - source_text: str = "", unbalanced_tags: str = "unchecked", - use_dmp: bool = True, annotator: Callable[[Any, str, Any], str] | None = None, - offset_updater: SpanUpdater | None = None, ) -> str: - """Given a list of citations and the text from which they were parsed, + """Given a `eyecite.models.Document` and a list of citation positions, insert annotations into the text surrounding each citation. This could be useful for linking the citations to a URL, or otherwise indicating that they were successfully parsed or resolved. - If you pre-processed your text before extracting the citations, this - function will intelligently reconcile the differences between the original - source text and the cleaned text using a diffing algorithm, ensuring that - each annotation is inserted in the correct location. - Example: - >>> plain_text = "foo 1 U.S. 1 bar" - >>> citations = get_citations(plain_text) - >>> annotate_citations("foo 1 U.S. 1 bar", - ... [(citations[0].span(), "", "")]) + >>> document = Document("foo 1 U.S. 1 bar") + >>> citations = get_citations(document) + >>> annotate_citations(document, [(citations[0].span(), "", "")]) >>> >>> returns: "foo 1 U.S. 1 bar" Args: - plain_text: The text containing the citations. If this text was - cleaned, you should also pass the `source_text` below. + document: The `eyecite.models.Document` object from which the + citations were parsed. annotations: A `Tuple` of (1) the start and end positions of the citation in the text, (2) the text to insert before the citation, and (3) the text to insert after the citation. 
- source_text: If provided, apply annotations to this text instead using - a diffing algorithm. - unbalanced_tags: If provided, unbalanced_tags="skip" will skip - inserting annotations that result in invalid HTML. - unbalanced_tags="wrap" will ensure valid HTML by wrapping - annotations around any unbalanced tags. - use_dmp: If `True` (default), use the fast_diff_match_patch_python - library for diffing. If `False`, use the slower built-in difflib, - which may be useful for debugging. + unbalanced_tags: Optional instruction for how to handle the insertion + of annotations into a `eyecite.models.Document` instantiated from + markup. If `unbalanced_tags="unchecked"` (default), no handling + is performed. If `unbalanced_tags="skip"`, annotations that would + result in the creation of invalid markup are skipped. If + `unbalanced_tags="wrap"`, annotations that would result in the + creation of invalid markup are wrapped in additional tags to + ensure balance. annotator: If provided, should be a function that takes three arguments (the text to insert before, the text of the citation, and the text to insert after) and returns the annotation. This is useful for customizing the annotation action: If you don't pass this function, eyecite will simply concatenate the before_text, citation_text, and after_text together for each annotation. - offset_updater: If provided, use this SpanUpdater. Citation finding - for HTML / XML sources use a SpanUpdater called `plain_to_markup` - to find citations, passing it saves this expensive instantiation. Returns: The annotated text. 
""" if unbalanced_tags not in ["unchecked", "skip", "wrap"]: raise ValueError(f"Unknown option '{unbalanced_tags}") - # set up offset_updater if we have to move annotations to source_text - if offset_updater: - plain_text = source_text - elif source_text and source_text != plain_text: - placeholder_text = placeholder_markup(source_text) - offset_updater = SpanUpdater( - plain_text, placeholder_text, use_dmp=use_dmp - ) - plain_text = source_text + # if no cleaning was applied to the document, then no need to calculate + # any offsets + if document.source_text == document.cleaned_text: + offset_updater = None + else: + offset_updater = document.cleaned_to_source # append text for each annotation to out annotations = sorted(annotations) out = [] last_end = 0 for (start, end), before, after in annotations: - # if we're applying to source_text, update offsets + # update offsets if necessary if offset_updater: start = offset_updater.update(start, bisect_right) end = offset_updater.update(end, bisect_left) @@ -206,7 +192,7 @@ def annotate_citations( # if annotation is entirely covered, skip continue - span_text = plain_text[start:end] + span_text = document.source_text[start:end] # handle HTML tags if unbalanced_tags == "unchecked": @@ -217,7 +203,7 @@ def annotate_citations( else: # "skip" case original_span_text = span_text start, end, span_text = maybe_balance_style_tags( - start, end, plain_text + start, end, document.source_text ) if not is_balanced_html(span_text): logger.warning( @@ -234,14 +220,14 @@ def annotate_citations( # append each span out.extend( [ - plain_text[last_end:start], + document.source_text[last_end:start], annotated_span, ] ) last_end = end # append text after final citation - if last_end < len(plain_text): - out.append(plain_text[last_end:]) + if last_end < len(document.source_text): + out.append(document.source_text[last_end:]) return "".join(out) diff --git a/eyecite/find.py b/eyecite/find.py index 381d281d..19f0a088 100644 --- 
a/eyecite/find.py +++ b/eyecite/find.py @@ -1,6 +1,5 @@ import re from bisect import bisect_left, bisect_right -from collections.abc import Callable, Iterable from typing import cast from eyecite.helpers import ( @@ -38,43 +37,35 @@ def get_citations( - plain_text: str = "", + document: Document, remove_ambiguous: bool = False, tokenizer: Tokenizer = default_tokenizer, - markup_text: str = "", - clean_steps: Iterable[str | Callable[[str], str]] | None = None, ) -> list[CitationBase]: - """This is eyecite's main workhorse function. Given a string of text - (e.g., a judicial opinion or other legal doc), return a list of - `eyecite.models.CitationBase` objects representing the citations found - in the doc. + """This is eyecite's main workhorse function. Given a + `eyecite.models.Document` object (e.g., a judicial opinion, other legal + document, or just a string of text), return a list of + `eyecite.models.CitationBase` objects representing the citations found in + the document. Args: - plain_text: The text to parse. You may wish to use the - `eyecite.clean.clean_text` function to pre-process your text - before passing it here. + document: The `eyecite.models.Document` object (which can be + instantiated with just a string) to parse for citations. remove_ambiguous: Whether to remove citations that might refer to more than one reporter and can't be narrowed down by date. tokenizer: An instance of a Tokenizer object. See `eyecite.tokenizers` for information about available tokenizers. Uses the `eyecite.tokenizers.AhocorasickTokenizer` by default. 
- markup_text: if the source text has markup (XML or HTML mostly), pass - it to extract ReferenceCitations that may be detectable via - markup style tags - clean_steps: Cleanup steps and methods Returns: A list of `eyecite.models.CitationBase` objects """ - if plain_text == "eyecite": + if document.source_text == "eyecite": return joke_cite - document = Document( - plain_text=plain_text, - markup_text=markup_text, - clean_steps=clean_steps, - ) + # Tokenize the document using the chosen tokenizer document.tokenize(tokenizer=tokenizer) + + # Iterate through the document's tokens and look for citations citations: list[CitationBase] = [] for i, token in document.citation_tokens: citation: CitationBase @@ -135,6 +126,9 @@ def get_citations( citations.append(citation) + # Ensure that the extracted citations are sorted in the order that they + # appear in the document (addresses rare but possible overlaps and other + # subtle problems) citations = filter_citations(citations) # Remove citations with multiple reporter candidates where we couldn't @@ -159,16 +153,16 @@ def extract_reference_citations( :return: Reference citations """ - if len(document.plain_text) <= citation.span()[-1]: + if len(document.cleaned_text) <= citation.span()[-1]: return [] if not isinstance(citation, FullCaseCitation): return [] reference_citations = extract_pincited_reference_citations( - citation, document.plain_text + citation, document.cleaned_text ) - if document.markup_text: + if document.has_markup: reference_citations.extend( find_reference_citations_from_markup( document, @@ -293,7 +287,7 @@ def _extract_shortform_citation( }, ) - if document.markup_text: + if document.has_markup: find_case_name_in_html(citation, document, short=True) if citation.metadata.antecedent_guess is None: find_case_name(citation, document, short=True) @@ -425,35 +419,35 @@ def find_reference_citations_from_markup( regex = rf"<(?:{tags})>\s*({'|'.join(regexes)})[:;.,\s]*" if ( - not document.plain_to_markup - or 
not document.markup_to_plain - or not document.markup_text + not document.cleaned_to_source + or not document.source_to_cleaned + or not document.has_markup ): # ensure we have markup text return [] - start_in_markup = document.plain_to_markup.update( + start_in_markup = document.cleaned_to_source.update( citation.span()[0], bisect_right ) for match in re.finditer( - regex, document.markup_text[start_in_markup:] + regex, document.source_text[start_in_markup:] ): - full_start_in_plain = document.markup_to_plain.update( + full_start_in_plain = document.source_to_cleaned.update( start_in_markup + match.start(), bisect_left ) - full_end_in_plain = document.markup_to_plain.update( + full_end_in_plain = document.source_to_cleaned.update( start_in_markup + match.end(), bisect_right ) # the first group [match.group(0)] is the whole match, # with whitespace and punctuation. the second group, match.group(1) # is the only capturing and named group - start_in_plain = document.markup_to_plain.update( + start_in_plain = document.source_to_cleaned.update( start_in_markup + match.start(1), bisect_left ) - end_in_plain = document.markup_to_plain.update( + end_in_plain = document.source_to_cleaned.update( start_in_markup + match.end(1), bisect_right ) - raw_after = document.plain_text[full_end_in_plain:] + raw_after = document.cleaned_text[full_end_in_plain:] if re.match(r"^\s*(v[.s]|supra)\s", raw_after): # filter likely bad reference matches # when matching reference citations in markup it is possible @@ -464,7 +458,7 @@ def find_reference_citations_from_markup( reference = ReferenceCitation( token=CaseReferenceToken( - data=document.plain_text[start_in_plain:end_in_plain], + data=document.cleaned_text[start_in_plain:end_in_plain], start=start_in_plain, end=end_in_plain, ), diff --git a/eyecite/helpers.py b/eyecite/helpers.py index d4a9fb3e..d8528908 100644 --- a/eyecite/helpers.py +++ b/eyecite/helpers.py @@ -495,11 +495,14 @@ def find_html_tags_at_position( List of tuples 
containing (tag_name, start_pos, end_pos) Empty list if no matching tags found """ - markup_loc = document.plain_to_markup.update( # type: ignore + if not document.has_markup: + return [] + + markup_loc = document.cleaned_to_source.update( # type: ignore position, bisect_right, ) - tags = [r for r in document.emphasis_tags if r[1] <= markup_loc < r[2]] + tags = [r for r in document.emphasis_tags if r[1] <= markup_loc < r[2]] # type: ignore if len(tags) != 1: return [] return tags @@ -838,15 +841,15 @@ def convert_html_to_plain_text_and_loc( """ markup_location = results[0] - start = document.markup_to_plain.update( # type: ignore + start = document.source_to_cleaned.update( # type: ignore markup_location[1], bisect_right, ) - end = document.markup_to_plain.update( # type: ignore + end = document.source_to_cleaned.update( # type: ignore markup_location[2], bisect_right, ) - case_name = document.plain_text[start:end] + case_name = document.cleaned_text[start:end] return (case_name, start, end) diff --git a/eyecite/models.py b/eyecite/models.py index 605a0f70..a259459f 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -12,7 +12,11 @@ from eyecite import clean_text from eyecite.annotate import SpanUpdater -from eyecite.utils import REPORTERS_THAT_NEED_PAGE_CORRECTION, hash_sha256 +from eyecite.utils import ( + REPORTERS_THAT_NEED_PAGE_CORRECTION, + create_placeholder_markup, + hash_sha256, +) logger = logging.getLogger(__name__) @@ -532,7 +536,7 @@ def add_metadata(self, document: "Document"): add_post_citation(self, document.words) - if document.markup_text: + if document.has_markup: find_case_name_in_html(self, document) if self.metadata.defendant is None: find_case_name(self, document) @@ -878,82 +882,104 @@ def __eq__(self, other): @dataclass(eq=False, unsafe_hash=False) class Document: - """A class to encapsulate the source text and the pre-processing applied to - it before citation parsing - - If the source text comes from `markup_text`, SpanUpdater 
objects are - created to help on citation parsing + """Class representing the text of a legal document or string and + any pre-processing cleaning steps applied to it before citation parsing. + If the source text contains markup, indicate that by setting + `has_markup=True` for a performance boost in citation parsing. """ - plain_text: str = "" - markup_text: str | None = "" - citation_tokens: list[tuple[int, Token]] = field(default_factory=list) - words: Tokens = field(default_factory=list) - plain_to_markup: SpanUpdater | None = field(default=None, init=False) - markup_to_plain: SpanUpdater | None = field(default=None, init=False) - clean_steps: Iterable[str | Callable[[str], str]] | None = field( + ################# + # User parameters + ################# + + # Instantiate with some source text and indicate whether it has markup + # or not + source_text: str + has_markup: bool = False + + # Provide optional cleaning steps to apply to the instantiating text + clean_steps: Iterable[str | Callable[[str], str]] = field( default_factory=list ) - emphasis_tags: list[tuple[str, int, int]] = field(default_factory=list) - source_text: str = "" # will be useful for the annotation step - def __post_init__(self): - from eyecite.utils import placeholder_markup + # Specify how differences between the given source_text and the + # cleaned cleaned_text should be calculated. If `True` (default), the + # fast_diff_match_patch_python library is used for diffing. If `False`, + # the slower built-in difflib is used instead, which may be useful for + # debugging. 
+ use_dmp: bool = True - if self.plain_text and not self.markup_text: - self.source_text = self.plain_text - if self.clean_steps: - self.plain_text = clean_text(self.plain_text, self.clean_steps) + ###################################################### + # Document properties that are set after instantiation + ###################################################### - elif self.markup_text and not self.plain_text: - self.source_text = self.markup_text + # The cleaned version of the source_text + cleaned_text: str = field(init=False) - if "html" not in self.clean_steps: - self.clean_steps.insert("html", 0) - logger.warning( - "`html` has been added to `markup_text` clean_steps list" - ) + # The tokenized version of the cleaned_text + citation_tokens: list[tuple[int, Token]] = field(init=False) + words: Tokens = field(init=False) - self.plain_text = clean_text(self.markup_text, self.clean_steps) + # Functions for diffing the source_text and cleaned_text + cleaned_to_source: SpanUpdater | None = field(default=None, init=False) + source_to_cleaned: SpanUpdater | None = field(default=None, init=False) - # Replace original tags (including their attributes) with same‐length placeholders - # so that SpanUpdater’s offset calculations remain correct and aren’t skewed by - # attribute characters (e.g., in id or index). ex. 
- placeholder_markup = placeholder_markup(self.markup_text) + # When `has_markup=True`, these are additional variables + # representing useful markup-specific features of the document + emphasis_tags: list[tuple[str, int, int]] | None = field( + default=None, init=False + ) + placeholder_markup: str | None = field(default=None, init=False) - self.plain_to_markup = SpanUpdater( - self.plain_text, placeholder_markup - ) - self.markup_to_plain = SpanUpdater( - self.markup_text, self.plain_text - ) + def __post_init__(self): + """Configure Document properties after user initialization""" - self.identify_emphasis_tags() + # (1) Plain text + if not self.has_markup: + self.cleaned_text = clean_text(self.source_text, self.clean_steps) - elif not self.markup_text and not self.plain_text: - raise ValueError("Both `markup_text` and `plain_text` are empty") + # (2) Markup text + else: + if "html" not in self.clean_steps: + self.clean_steps.insert(0, "html") + logger.warning("`html` has been added to clean_steps list") - elif self.plain_text and self.markup_text: - # both arguments were passed, we assume that `plain_text` is the - # cleaned version of `markup_text` - if self.clean_steps: - raise ValueError( - "Both `markup_text` and `plain_text` were passed. " - "Not clear which to apply `clean_steps` to" - ) + self.cleaned_text = clean_text(self.source_text, self.clean_steps) + + # Replace original tags (including their attributes) with + # same-length placeholders so that the SpanUpdater's offset + # calculations remain correct and aren't skewed by attribute + # characters (e.g., in id or index). ex. 
+ self.placeholder_markup = create_placeholder_markup( + self.source_text + ) - self.source_text = self.markup_text + # Identify any emphasis tags in the markup + self._identify_emphasis_tags() - def identify_emphasis_tags(self): + # Create functions for diffing the source_text and cleaned_text + self.cleaned_to_source = SpanUpdater( + self.cleaned_text, + self.placeholder_markup if self.has_markup else self.source_text, + use_dmp=self.use_dmp, + ) + self.source_to_cleaned = SpanUpdater( + self.placeholder_markup if self.has_markup else self.source_text, + self.cleaned_text, + use_dmp=self.use_dmp, + ) + + def _identify_emphasis_tags(self): pattern = re.compile( r"<(em|i)[^>]*>(.*?)", re.IGNORECASE | re.DOTALL ) self.emphasis_tags = [ (m.group(2).strip(), m.start(), m.end()) - for m in pattern.finditer(self.markup_text) + for m in pattern.finditer(self.source_text) ] def tokenize(self, tokenizer): - """Tokenize the document and store the results in the document - object""" - self.words, self.citation_tokens = tokenizer.tokenize(self.plain_text) + """Tokenize the document and store the results""" + self.words, self.citation_tokens = tokenizer.tokenize( + self.cleaned_text + ) diff --git a/eyecite/utils.py b/eyecite/utils.py index 13d089ea..a2485b34 100644 --- a/eyecite/utils.py +++ b/eyecite/utils.py @@ -317,7 +317,7 @@ def maybe_balance_style_tags( return start, end, plain_text[start:end] -def placeholder_markup(html: str) -> str: +def create_placeholder_markup(html: str) -> str: """Create placeholder HTML to identify annotation locations. 
This allows diffing or annotation algorithms to maintain correct diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py index b2e14f31..9d0fff8a 100644 --- a/tests/test_AnnotateTest.py +++ b/tests/test_AnnotateTest.py @@ -2,7 +2,7 @@ from pathlib import Path from unittest import TestCase -from eyecite import annotate_citations, clean_text, get_citations +from eyecite import annotate_citations, get_citations from eyecite.models import Document from eyecite.utils import maybe_balance_style_tags @@ -62,27 +62,37 @@ def lower_annotator(before, text, after): "foo 1 U.S. 1 bar", "foo <0>1 U.S. 1 bar", ["html", "inline_whitespace"], + {"has_markup": True}, ), # whitespace and html -- unbalanced tags are repaired ( "foo 1 U.S. 1; 2 U.S. 2", "foo <0>1 U.S. 1; <1>2 U.S. 2", ["html", "inline_whitespace"], - {"unbalanced_tags": "skip"}, + { + "unbalanced_tags": "skip", + "has_markup": True, + }, ), # whitespace and html -- wrap unbalanced tags ( "1 U.S. 1; 2 U.S. 2", "<0>1 U.S.<0> 1; <1>2 U.S. 2", ["html", "inline_whitespace"], - {"unbalanced_tags": "wrap"}, + { + "unbalanced_tags": "wrap", + "has_markup": True, + }, ), # tighly-wrapped html -- skip unbalanced tags (issue #54) ( "foo Ibid. bar", "foo <0>Ibid. bar", ["html", "inline_whitespace"], - {"unbalanced_tags": "skip"}, + { + "unbalanced_tags": "skip", + "has_markup": True, + }, ), # whitespace containing linebreaks ("1\nU.S. 1", "<0>1\nU.S. 
1", ["all_whitespace"]), @@ -123,7 +133,7 @@ def lower_annotator(before, text, after): { "annotate_anchors": True, "unbalanced_tags": "skip", - "use_markup": True, + "has_markup": True, }, ), # solvable unbalanced tag @@ -145,7 +155,7 @@ def lower_annotator(before, text, after): { "annotate_anchors": True, "unbalanced_tags": "skip", - "use_markup": True, + "has_markup": True, }, ), # The next 2 examples could be resolved if we increased the @@ -175,7 +185,7 @@ def lower_annotator(before, text, after): { "annotate_anchors": True, "unbalanced_tags": "skip", - "use_markup": True, + "has_markup": True, }, ), ( @@ -197,7 +207,7 @@ def lower_annotator(before, text, after): { "annotate_anchors": True, "unbalanced_tags": "skip", - "use_markup": True, + "has_markup": True, }, ), ( @@ -207,7 +217,7 @@ def lower_annotator(before, text, after): { "annotate_anchors": False, "unbalanced_tags": "skip", - "use_markup": True, + "has_markup": True, }, ), # Ensure < does not affect annotations @@ -218,7 +228,7 @@ def lower_annotator(before, text, after): { "annotate_anchors": False, "unbalanced_tags": "skip", - "use_markup": True, + "has_markup": True, }, ), ) @@ -229,18 +239,14 @@ def lower_annotator(before, text, after): clean_steps=clean_steps, annotate_args=annotate_kwargs, ): - if annotate_kwargs.pop("use_markup", False): - get_citations_args = {"markup_text": source_text} + if annotate_kwargs.pop("has_markup", False): + document = Document( + source_text, has_markup=True, clean_steps=clean_steps + ) else: - get_citations_args = {"plain_text": source_text} - - document = Document( - **get_citations_args, clean_steps=clean_steps - ) + document = Document(source_text, clean_steps=clean_steps) - cites = get_citations( - **get_citations_args, clean_steps=clean_steps - ) + cites = get_citations(document) annotations = [ (c.span(), f"<{i}>", f"</{i}>") for i, c in enumerate(cites) ] @@ -253,9 +259,8 @@ def lower_annotator(before, text, after): ] annotated = annotate_citations( - 
document.plain_text, + document, annotations, - source_text=source_text, **annotate_kwargs, ) self.assertEqual(annotated, expected) @@ -298,9 +303,14 @@ def test_long_diff(self): opinion_text = ( Path(__file__).parent / "assets" / "opinion.txt" ).read_text() - cleaned_text = clean_text(opinion_text, ["all_whitespace"]) + document = Document( + opinion_text, + has_markup=True, + clean_steps=["html", "all_whitespace"], + ) annotated_text = annotate_citations( - cleaned_text, [((902, 915), "~FOO~", "~BAR~")], opinion_text + document, + [((902, 915), "~FOO~", "~BAR~")], ) self.assertIn("~FOO~539\n U. S. 306~BAR~", annotated_text) @@ -333,11 +343,13 @@ def test_span_with_pincite(self): ), ] for source_text, expected in test_pairs: - plain_text = clean_text(source_text, ["all_whitespace", "html"]) - citations = get_citations(plain_text) + document = Document( + source_text, clean_steps=["all_whitespace", "html"] + ) + citations = get_citations(document) for citation in citations: start, end = citation.span_with_pincite() - pin_cite_span = plain_text[start:end] + pin_cite_span = document.cleaned_text[start:end] self.assertEqual( pin_cite_span, expected.pop(0), diff --git a/tests/test_CourtsTest.py b/tests/test_CourtsTest.py index 500c4b8a..642cff02 100644 --- a/tests/test_CourtsTest.py +++ b/tests/test_CourtsTest.py @@ -1,6 +1,7 @@ from unittest import TestCase from eyecite import get_citations +from eyecite.models import Document class RegexesTest(TestCase): @@ -38,5 +39,5 @@ def test_parenthetical_court_parser(self): "Wallace v. Cellco P'ship, No. CV 14-8052-DSF (AS), 2015 WL 13908106, at *7 (C.D. Cal. Feb. 
9, 2015)": "cacd", } for key in samples: - eyecite_result = get_citations(key) + eyecite_result = get_citations(Document(key)) self.assertEqual(eyecite_result[0].metadata.court, samples[key]) diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index 89416e31..c52ebdb9 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -62,15 +62,19 @@ def get_comparison_attrs(cite): tokenizers = tested_tokenizers for q, expected_cites, *kwargs in test_pairs: kwargs = kwargs[0] if kwargs else {} - clean_steps = kwargs.get("clean_steps", []) + clean_steps = kwargs.pop("clean_steps", []) for tokenizer in tokenizers: with self.subTest( message, tokenizer=type(tokenizer).__name__, q=q ): if "html" in clean_steps: - kwargs["markup_text"] = q + kwargs["document"] = Document( + q, has_markup=True, clean_steps=clean_steps + ) else: - kwargs["plain_text"] = q + kwargs["document"] = Document( + q, clean_steps=clean_steps + ) cites_found = get_citations(tokenizer=tokenizer, **kwargs) self.assertEqual( @@ -1115,7 +1119,7 @@ def test_nominative_reporter_overlaps(self): ), ] for cite_string, cite_object in pairs: - parsed_cite = get_citations(cite_string)[0] + parsed_cite = get_citations(Document(cite_string))[0] self.assertEqual( parsed_cite, cite_object, @@ -1147,7 +1151,7 @@ def test_citation_fullspan(self): # Make sure it works with several citations in one string combined_example = "citation number one is Wilson v. Mar. Overseas Corp., 150 F.3d 1, 6-7 ( 1st Cir. 1998); This is different from Commonwealth v. Bauer, 604 A.2d 1098 (Pa.Super. 1992), my second example" - extracted = get_citations(combined_example) + extracted = get_citations(Document(combined_example)) # answers format is (citation_index, (full_span_start, full_span_end)) answers = [(0, (23, 86)), (1, (111, 164))] for cit_idx, (start, end) in answers: @@ -1173,7 +1177,7 @@ def test_citation_fullspan(self): "Alderson v. Concordia Par. Corr. Facility, 848 F.3d 415 (5th Cir. 
2017)", ] for example in simple_examples: - extracted = get_citations(example)[0] + extracted = get_citations(Document(example))[0] error_msg = "Full span indices for a simple example should be (0, len(example)) " self.assertEqual( extracted.full_span(), (0, len(example)), error_msg @@ -1184,7 +1188,7 @@ def test_citation_fullspan(self): ("Citing 66 B.U. L. Rev. 71 (1986)", 7), ] for sentence, start_idx in stopword_examples: - extracted = get_citations(sentence)[0] + extracted = get_citations(Document(sentence))[0] error_msg = "Wrong span for stopword example" self.assertEqual( extracted.full_span(), (start_idx, len(sentence)), error_msg @@ -1204,10 +1208,10 @@ def test_reference_extraction_using_resolved_names(self): [State v. Wingler at 175, citing, Minnesota ex rel.]""", ] for plain_text in texts: - citations = get_citations(plain_text) + document = Document(plain_text) + citations = get_citations(document) found_cite = citations[0] found_cite.metadata.resolved_case_name = "State v. Wingler" - document = Document(plain_text=plain_text, markup_text="") references = extract_reference_citations( citation=found_cite, document=document ) @@ -1242,7 +1246,11 @@ def test_reference_extraction_from_markup(self): punitive goals as well.\" 44 F.3d at 493.

""" citations = get_citations( - markup_text=markup_text, clean_steps=["html", "all_whitespace"] + Document( + markup_text, + has_markup=True, + clean_steps=["html", "all_whitespace"], + ) ) references = [c for c in citations if isinstance(c, ReferenceCitation)] # Tests both for the order and exact counts. Note that there is one @@ -1283,7 +1291,11 @@ def test_reference_filtering(self): ] for markup_text in texts: citations = get_citations( - markup_text=markup_text, clean_steps=["html", "all_whitespace"] + Document( + markup_text, + has_markup=True, + clean_steps=["html", "all_whitespace"], + ) ) self.assertFalse( any(isinstance(cite, ReferenceCitation) for cite in citations) @@ -1955,6 +1967,6 @@ def test_citation_in_parenthetical_does_not_emit_warning(self, mock_warn): warning should be emitted. """ text = "Gotthelf v. Toyota Motor Sales, U.S.A., Inc., 525 F. App’x 94, 103 n.15 (3d Cir. 2013) (quoting Iqbal, 556 U.S. at 686-87)." - citations = get_citations(text) + citations = get_citations(Document(text)) self.assertEqual(len(citations), 2) mock_warn.assert_not_called() diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py index b195530b..a7ef0e74 100644 --- a/tests/test_ModelsTest.py +++ b/tests/test_ModelsTest.py @@ -1,7 +1,7 @@ from unittest import TestCase from eyecite import get_citations -from eyecite.models import Resource +from eyecite.models import Document, Resource from eyecite.test_factories import ( case_citation, id_citation, @@ -98,8 +98,8 @@ def test_citation_comparison_with_nominative_reporter(self): """Are two citation objects equal when their attributes are the same, even if one of them has a nominative reporter?""" citations = [ - get_citations("5 U.S. 137")[0], - get_citations("5 U.S. (1 Cranch) 137")[0], + get_citations(Document("5 U.S. 137"))[0], + get_citations(Document("5 U.S. 
(1 Cranch) 137"))[0], ] print( "Testing citation comparison with nominative reporter...", end=" " @@ -128,8 +128,8 @@ def test_tax_court_citation_comparison(self): the same, even if they are tax court citations and might not have volumes?""" citations = [ - get_citations("T.C.M. (RIA) ¶ 95,342")[0], - get_citations("T.C.M. (RIA) ¶ 95,342")[0], + get_citations(Document("T.C.M. (RIA) ¶ 95,342"))[0], + get_citations(Document("T.C.M. (RIA) ¶ 95,342"))[0], ] print("Testing tax court citation comparison...", end=" ") self.assertEqual(citations[0], citations[1]) @@ -163,7 +163,7 @@ def test_missing_page_cite_conversion(self): attribute set to None?""" citation1 = case_citation(2, volume="2", reporter="U.S.", page="__") - citation2 = get_citations("2 U.S. __")[0] + citation2 = get_citations(Document("2 U.S. __"))[0] print("Testing missing page conversion...", end=" ") self.assertIsNone(citation1.groups["page"]) self.assertIsNone(citation2.groups["page"]) @@ -207,7 +207,9 @@ def test_corrected_full_citation_includes_closing_parenthesis(self): """Does the corrected_citation_full method return a properly formatted citation?""" journal_citation = get_citations( - "Originalism without Foundations, 65 N.Y.U. L. Rev. 1373 (1990)" + Document( + "Originalism without Foundations, 65 N.Y.U. L. Rev. 1373 (1990)" + ) )[0] self.assertEqual( journal_citation.corrected_citation_full(), @@ -215,7 +217,7 @@ def test_corrected_full_citation_includes_closing_parenthesis(self): ) full_case_citation = get_citations( - "Meritor Sav. Bank v. Vinson, 477 U.S. 57, 60 (1986)" + Document("Meritor Sav. Bank v. Vinson, 477 U.S. 
57, 60 (1986)") )[0] self.assertEqual( full_case_citation.corrected_citation_full(), @@ -242,7 +244,7 @@ def test_page_correction(self): ), ] for citation, corrected_citation, corrected_page in tests: - cite = get_citations(citation)[0] + cite = get_citations(Document(citation))[0] self.assertEqual( cite.corrected_citation(), corrected_citation, diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py index c6ad9003..bf78e722 100644 --- a/tests/test_ResolveTest.py +++ b/tests/test_ResolveTest.py @@ -52,10 +52,8 @@ def checkReferenceResolution( None """ - document = Document( - plain_text=citation_text, - ) - citations = get_citations(citation_text) + document = Document(citation_text) + citations = get_citations(document) if resolved_case_name_short: citations[ 0 @@ -127,7 +125,7 @@ def checkResolution(self, *expected_resolutions: tuple[int | None, str]): for i, cite_text in expected_resolutions: # extract cite and make sure there's only one: - cites = get_citations(cite_text) + cites = get_citations(document=Document(cite_text)) self.assertEqual( len(cites), 1, diff --git a/tests/test_UtilsTest.py b/tests/test_UtilsTest.py index eddd1a01..e3a07056 100644 --- a/tests/test_UtilsTest.py +++ b/tests/test_UtilsTest.py @@ -3,6 +3,7 @@ from unittest import TestCase from eyecite import clean_text, get_citations +from eyecite.models import Document from eyecite.utils import dump_citations @@ -34,7 +35,7 @@ def test_clean_text_invalid(self): def test_dump_citations(self): text = "blah. Foo v. Bar, 1 U.S. 2, 3-4 (1999). blah" - cites = get_citations(text) + cites = get_citations(Document(text)) dumped_text = dump_citations(cites, text) dumped_text = re.sub(r"\x1B.*?m", "", dumped_text) # strip colors expected = dedent(