Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
9f7e49a
dummy push
yonadavGit Jan 26, 2026
0c0b601
chore(tasks): add tqdm progress bars to bulk disambiguation task disp…
yonadavGit Jan 26, 2026
95bffcd
chore(disambiguator): fix SEFARIA_SEARCH_URL to remove redundant /api…
yonadavGit Jan 27, 2026
662ead1
chore(disambiguator): handle and record Dicta API errors with non-200…
yonadavGit Jan 27, 2026
0d5d8dd
dummy commit
yonadavGit Jan 27, 2026
e609e25
chore(tasks): add resume support for bulk disambiguation task dispatc…
yonadavGit Jan 27, 2026
c78bf0e
chore(tasks): update ambiguous payload resume point for bulk disambig…
yonadavGit Jan 27, 2026
6148a77
chore(tasks): add CLI args for skipping/resuming ambiguous and non-se…
yonadavGit Jan 27, 2026
9c7d7cc
chore(tests): add integration tests for non-segment disambiguator
yonadavGit Feb 1, 2026
68430ff
chore(disambiguator): add LLM prior formation and confirmation functi…
yonadavGit Feb 1, 2026
7a56a13
chore(disambiguator): update Sefaria search functions to return lists…
yonadavGit Feb 1, 2026
6f1f4f3
chore(disambiguator): update default LLM model to claude-sonnet-4-5-2…
yonadavGit Feb 3, 2026
6c5263e
chore(disambiguator): add function to strip cantillation and vowels f…
yonadavGit Feb 3, 2026
f1adc84
chore(tests): add additional test cases for non-segment disambiguator
yonadavGit Feb 3, 2026
739899c
chore(disambiguator): refine LLM prompt for verbatim phrase extractio…
yonadavGit Feb 3, 2026
1394c1a
chore(tests): add test case for ownerless property reference resolution
yonadavGit Feb 4, 2026
967070f
chore(tests): comment out outdated test case for Hebrew reference res…
yonadavGit Feb 4, 2026
ab3aa29
chore(disambiguator): add llm_resolved_phrase to NonSegmentResolution…
yonadavGit Feb 4, 2026
5ab5c57
chore(disambiguator): enhance resolution metadata with llm_resolved_p…
yonadavGit Feb 4, 2026
5db1942
merge master
yonadavGit Feb 4, 2026
5ce06cb
chore(disambiguator): update resolution fields for ambiguous and non-…
yonadavGit Feb 4, 2026
446b407
chore(disambiguator): reduce debug limit and enhance non-segment reso…
yonadavGit Feb 5, 2026
1f9f626
chore(disambiguator): add integration tests for ambiguous disambiguat…
yonadavGit Feb 8, 2026
f81cd5d
chore(disambiguator): implement LLM-based resolution for base text vs…
yonadavGit Feb 8, 2026
773005f
chore(disambiguator): update resolution result fields to use optional…
yonadavGit Feb 8, 2026
1a91b41
chore(disambiguator): enhance handling of ambiguous references and up…
yonadavGit Feb 8, 2026
f2f431c
chore(disambiguator): improve logging for resolution data and adjust …
yonadavGit Feb 9, 2026
0d5e642
dummy push
yonadavGit Feb 9, 2026
07a7117
fix(disambiguator): add line break
nsantacruz Feb 9, 2026
c39ce12
Merge branch 'master' into chore/sc-40225/create-task-for-running-dis…
nsantacruz Feb 9, 2026
5094ca5
chore: update gunicorn version to 25.0.3
nsantacruz Feb 9, 2026
5bd6af6
chore: downgrade gunicorn version to 23.0.0
nsantacruz Feb 9, 2026
b1a7e16
chore: downgrade gunicorn version to 23.0.0
nsantacruz Feb 9, 2026
d634a97
merge master
yonadavGit Feb 10, 2026
9795f99
Merge remote-tracking branch 'origin/chore/sc-40225/create-task-for-r…
yonadavGit Feb 10, 2026
172ed3e
chore(dispatch_library_links_disambiguation_tasks): enhance non-segme…
yonadavGit Feb 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 73 additions & 17 deletions scripts/dispatch_library_links_disambiguation_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,42 @@
2. Non-segment-level resolutions

Set DEBUG_MODE = True at the top of the script to limit processing to DEBUG_LIMIT randomly sampled documents for debugging.

Examples:
python dispatch_library_links_disambiguation_tasks.py --ambiguous-start 565440 --non-segment-start 0
python dispatch_library_links_disambiguation_tasks.py --ambiguous-start skip --non-segment-start 0
"""

import django
django.setup()

from collections import defaultdict
import argparse
from tqdm import tqdm
from sefaria.model import Ref
from sefaria.system.exceptions import InputError
from sefaria.system.database import db
from sefaria.settings import CELERY_QUEUES, CELERY_ENABLED
from sefaria.celery_setup.app import app
from dataclasses import asdict
from sefaria.helper.linker.disambiguator import AmbiguousResolutionPayload, NonSegmentResolutionPayload
from sefaria.helper.linker.tasks import _is_non_segment_or_perek_ref

# Debug-mode configuration. Each constant is assigned exactly once so the
# effective value is unambiguous to readers.
DEBUG_MODE = False  # True = sample a small random subset; False = process all matching LinkerOutput docs
DEBUG_LIMIT = 10  # Number of random examples to fetch in debug mode
DEBUG_SEED = 6133  # Seed for reproducible random sampling



def _parse_start_arg(value: str):
if value is None:
return 0
if value.lower() == "skip":
return "skip"
return int(value)


def is_segment_level_ref(ref_str):
"""Check if a reference string is segment-level"""
try:
Expand Down Expand Up @@ -147,8 +163,11 @@ def find_non_segment_level_resolutions():
"$elemMatch": {
"type": "citation",
"failed": {"$ne": True},
"ambiguous": {"$ne": True},
"ref": {"$exists": True}
"ref": {"$exists": True},
"$or": [
{"ambiguous": {"$ne": True}},
{"llm_ambiguous_option_valid": True},
],
}
}
}
Expand All @@ -170,16 +189,21 @@ def find_non_segment_level_resolutions():
for span in raw_linker_output.get('spans', []):
# Only look at successful citation resolutions
if (span.get('type') != 'citation' or
span.get('failed', False) or
span.get('ambiguous', False)):
span.get('failed', False)):
continue
if span.get('ambiguous', False) and not span.get('llm_ambiguous_option_valid'):
continue

ref_str = span.get('ref')
if span.get('ambiguous', False) and span.get('llm_ambiguous_option_valid'):
amb_resolved_ref = span.get('llm_resolved_ref_ambiguous')
if amb_resolved_ref:
ref_str = amb_resolved_ref
if not ref_str:
continue

# Check if it's NOT segment level
if not is_segment_level_ref(ref_str):
# Check if it's NOT segment level (including perek/parasha treated as non-segment)
if _is_non_segment_or_perek_ref(ref_str):
try:
oref = Ref(ref_str)
ref_level = 'unknown'
Expand Down Expand Up @@ -219,6 +243,15 @@ def enqueue_bulk_disambiguation(payload: dict):

def main():
"""Main execution function - find and dispatch tasks"""
parser = argparse.ArgumentParser()
parser.add_argument("--ambiguous-start", default="0",
help="Number to skip for ambiguous resolutions, or 'skip'")
parser.add_argument("--non-segment-start", default="0",
help="Number to skip for non-segment resolutions, or 'skip'")
args = parser.parse_args()
ambiguous_start_from = _parse_start_arg(args.ambiguous_start)
non_segment_start_from = _parse_start_arg(args.non_segment_start)

print("Starting Library Links Disambiguation Tasks Dispatcher")
if DEBUG_MODE:
print(f"DEBUG MODE: Limited to {DEBUG_LIMIT} documents")
Expand All @@ -237,17 +270,39 @@ def main():
return

# Find ambiguous resolutions
ambiguous_resolutions = find_ambiguous_resolutions()
ambiguous_resolutions = [] if ambiguous_start_from == "skip" else find_ambiguous_resolutions()

# Find non-segment-level resolutions
non_segment_resolutions = find_non_segment_level_resolutions()

# Dispatch bulk disambiguation tasks (single payload each)
print(f"Dispatching {len(ambiguous_resolutions) + len(non_segment_resolutions)} bulk disambiguation tasks...")
# Dispatch ambiguous first
print(f"Dispatching {len(ambiguous_resolutions)} ambiguous disambiguation tasks...")
try:
for resolution in ambiguous_resolutions:
ambiguous_iter = (
ambiguous_resolutions[ambiguous_start_from:]
if isinstance(ambiguous_start_from, int) and ambiguous_start_from
else ambiguous_resolutions
)
for resolution in tqdm(
ambiguous_iter,
desc="Ambiguous resolutions",
initial=ambiguous_start_from if isinstance(ambiguous_start_from, int) else 0,
total=len(ambiguous_resolutions),
):
enqueue_bulk_disambiguation(asdict(resolution))
for resolution in non_segment_resolutions:

# Find non-segment-level resolutions AFTER ambiguous dispatch
non_segment_resolutions = [] if non_segment_start_from == "skip" else find_non_segment_level_resolutions()
print(f"Dispatching {len(non_segment_resolutions)} non-segment disambiguation tasks...")

non_segment_iter = (
non_segment_resolutions[non_segment_start_from:]
if isinstance(non_segment_start_from, int) and non_segment_start_from
else non_segment_resolutions
)
for resolution in tqdm(
non_segment_iter,
desc="Non-segment resolutions",
initial=non_segment_start_from if isinstance(non_segment_start_from, int) else 0,
total=len(non_segment_resolutions),
):
enqueue_bulk_disambiguation(asdict(resolution))
print("Dispatched bulk disambiguation tasks")
except Exception as e:
Expand All @@ -259,4 +314,5 @@ def main():


# Script entry point: parse the CLI arguments and dispatch the tasks via main().
if __name__ == "__main__":
    main()
Loading
Loading