From 1baf6d64e531c062ab77d2c3bd37ba2168b4a673 Mon Sep 17 00:00:00 2001 From: Yitzhak Clark Date: Mon, 19 Jan 2026 10:08:14 +0200 Subject: [PATCH] feat(dev): add file-based caching for library objects during development Add optional file-based caching for expensive library objects (AutoCompleter, Linker, topic pools) that persist across Django server reloads during development. This significantly speeds up the development iteration cycle by avoiding re-initialization of these objects on every code change. New settings: - USE_DEV_FILE_CACHE: Enable/disable the feature (default: False) - DEV_FILE_CACHE_DIR: Cache directory (default: /tmp/sefaria_dev_cache) Modified methods to use file cache: - Library.build_full_auto_completer() - Library.build_lexicon_auto_completers() - Library.build_cross_lexicon_auto_completer() - Library.build_linker() - TopicManager.build_slug_to_pools_cache() Added management command: - python manage.py clear_dev_cache Co-Authored-By: Claude Opus 4.5 --- django_topics/models/topic.py | 13 ++ reader/management/commands/clear_dev_cache.py | 55 ++++++++ sefaria/local_settings_example.py | 7 + sefaria/model/text.py | 60 +++++++++ sefaria/settings.py | 5 + sefaria/system/cache.py | 123 +++++++++++++++++- 6 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 reader/management/commands/clear_dev_cache.py diff --git a/django_topics/models/topic.py b/django_topics/models/topic.py index 0d8b9848db..e42762f1c9 100644 --- a/django_topics/models/topic.py +++ b/django_topics/models/topic.py @@ -24,7 +24,17 @@ def get_topic_slugs_by_pool(self, pool: str) -> QuerySet: return self.filter(pools__name=pool).values_list("slug", flat=True) def build_slug_to_pools_cache(self, rebuild=False): + from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache + if rebuild or not self.slug_to_pools: + # Try to load from dev file cache first (only if not forcing rebuild) + if not rebuild: + cached = load_from_dev_file_cache("topic_slug_to_pools") + if cached is not None: + self.slug_to_pools = cached + return + + # Build from scratch new_slug_to_pools = defaultdict(list) topics = self.model.objects.values_list('slug', 'pools__name') for slug, pool_name in topics: @@ -32,6 +42,9 @@ def build_slug_to_pools_cache(self, rebuild=False): new_slug_to_pools[slug].append(pool_name) self.slug_to_pools = new_slug_to_pools + # Save to dev file cache for next reload + save_to_dev_file_cache("topic_slug_to_pools", dict(self.slug_to_pools)) + class Topic(models.Model): slug = models.CharField(max_length=255, primary_key=True) en_title = models.CharField(max_length=255, blank=True, default="") diff --git a/reader/management/commands/clear_dev_cache.py b/reader/management/commands/clear_dev_cache.py new file mode 100644 index 0000000000..5b6bebc171 --- /dev/null +++ b/reader/management/commands/clear_dev_cache.py @@ -0,0 +1,55 @@ +""" +Management command to clear the development file cache. 
+ +Usage: + python manage.py clear_dev_cache # Clear all cache entries + python manage.py clear_dev_cache --list # List all cache entries + python manage.py clear_dev_cache --name full_auto_completer # Clear specific entry +""" +from django.core.management.base import BaseCommand +from sefaria.system.cache import clear_dev_file_cache, list_dev_file_cache, is_dev_file_cache_enabled + + +class Command(BaseCommand): + help = 'Clear the development file cache used for persisting library objects across server reloads' + + def add_arguments(self, parser): + parser.add_argument( + '--name', + type=str, + help='Clear a specific cache entry by name (e.g., full_auto_completer, linker_he)', + ) + parser.add_argument( + '--list', + action='store_true', + help='List all cache entries instead of clearing', + ) + + def handle(self, *args, **options): + if not is_dev_file_cache_enabled(): + self.stdout.write( + self.style.WARNING( + 'Dev file cache is not enabled. Set USE_DEV_FILE_CACHE = True in local_settings.py' + ) + ) + + if options['list']: + entries = list_dev_file_cache() + if entries: + self.stdout.write('Dev file cache entries:') + for entry in entries: + self.stdout.write(f' - {entry}') + else: + self.stdout.write('No cache entries found.') + return + + name = options.get('name') + deleted = clear_dev_file_cache(name) + + if name: + if deleted: + self.stdout.write(self.style.SUCCESS(f'Cleared cache entry: {name}')) + else: + self.stdout.write(self.style.WARNING(f'Cache entry not found: {name}')) + else: + self.stdout.write(self.style.SUCCESS(f'Cleared {deleted} cache entries')) diff --git a/sefaria/local_settings_example.py b/sefaria/local_settings_example.py index bcaa8f3364..845bf8305f 100644 --- a/sefaria/local_settings_example.py +++ b/sefaria/local_settings_example.py @@ -220,6 +220,13 @@ # Turns on loading of machine learning models to run linker ENABLE_LINKER = False +# File-based caching for library objects (AutoCompleter, Linker) during development. +# When enabled, these expensive objects are serialized to disk after first build +# and loaded from disk on subsequent server reloads, significantly speeding up +# development iteration. Should NOT be used in production. +USE_DEV_FILE_CACHE = False +DEV_FILE_CACHE_DIR = "/tmp/sefaria_dev_cache" + # Caching with Cloudflare CLOUDFLARE_ZONE = "" CLOUDFLARE_EMAIL = "" diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 7771bb649b..a427e1c5a7 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -5290,6 +5290,21 @@ def build_full_auto_completer(self): for each of the languages in the library. Sets internal boolean to True upon successful completion to indicate auto completer is ready. 
""" + from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache + + # Try to load from dev file cache first + cached = load_from_dev_file_cache("full_auto_completer") + if cached is not None: + self._full_auto_completer = cached + # Re-establish cross-references between language autocompleters + for lang in self.langs: + self._full_auto_completer[lang].set_other_lang_ac( + self._full_auto_completer["he" if lang == "en" else "en"] + ) + self._full_auto_completer_is_ready = True + return + + # Build from scratch from .autospell import AutoCompleter self._full_auto_completer = { lang: AutoCompleter(lang, library, include_people=True, include_topics=True, include_categories=True, include_parasha=False, include_users=True, include_collections=True) for lang in self.langs @@ -5299,12 +5314,25 @@ def build_full_auto_completer(self): self._full_auto_completer[lang].set_other_lang_ac(self._full_auto_completer["he" if lang == "en" else "en"]) self._full_auto_completer_is_ready = True + # Save to dev file cache for next reload + save_to_dev_file_cache("full_auto_completer", self._full_auto_completer) + def build_lexicon_auto_completers(self): """ Sets lexicon autocompleter for each lexicon in LexiconSet using a LexiconTrie Sets internal boolean to True upon successful completion to indicate auto completer is ready. """ + from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache + + # Try to load from dev file cache first + cached = load_from_dev_file_cache("lexicon_auto_completers") + if cached is not None: + self._lexicon_auto_completer = cached + self._lexicon_auto_completer_is_ready = True + return + + # Build from scratch from .autospell import LexiconTrie from .lexicon import LexiconSet self._lexicon_auto_completer = { @@ -5312,15 +5340,31 @@ def build_lexicon_auto_completers(self): } self._lexicon_auto_completer_is_ready = True + # Save to dev file cache for next reload + save_to_dev_file_cache("lexicon_auto_completers", self._lexicon_auto_completer) + def build_cross_lexicon_auto_completer(self): """ Builds the cross lexicon auto completer excluding titles Sets internal boolean to True upon successful completion to indicate auto completer is ready. 
""" + from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache + + # Try to load from dev file cache first + cached = load_from_dev_file_cache("cross_lexicon_auto_completer") + if cached is not None: + self._cross_lexicon_auto_completer = cached + self._cross_lexicon_auto_completer_is_ready = True + return + + # Build from scratch from .autospell import AutoCompleter self._cross_lexicon_auto_completer = AutoCompleter("he", library, include_titles=False, include_lexicons=True) self._cross_lexicon_auto_completer_is_ready = True + # Save to dev file cache for next reload + save_to_dev_file_cache("cross_lexicon_auto_completer", self._cross_lexicon_auto_completer) + def cross_lexicon_auto_completer(self): """ @@ -5682,6 +5726,17 @@ def get_linker(self, lang: str, rebuild=False): return linker def build_linker(self, lang: str): + from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache + + cache_key = f"linker_{lang}" + + # Try to load from dev file cache first + cached = load_from_dev_file_cache(cache_key) + if cached is not None: + self._linker_by_lang[lang] = cached + return self._linker_by_lang[lang] + + # Build from scratch from sefaria.model.linker.linker import Linker logger.info("Loading Spacy Model") @@ -5691,6 +5746,11 @@ def build_linker(self, lang: str): named_entity_recognizer = self._build_named_entity_recognizer(lang) cat_resolver = self._build_category_resolver(lang) self._linker_by_lang[lang] = Linker(named_entity_recognizer, ref_resolver, named_entity_resolver, cat_resolver) + + # Save to dev file cache for next reload + # Note: Spacy models may not pickle cleanly - if this fails, cache save is skipped gracefully + save_to_dev_file_cache(cache_key, self._linker_by_lang[lang]) + return self._linker_by_lang[lang] @staticmethod diff --git a/sefaria/settings.py b/sefaria/settings.py index 35770f59cf..0783a101de 100644 --- a/sefaria/settings.py +++ b/sefaria/settings.py @@ -310,6 +310,11 @@ def get_static_url(): } +# Default values for settings that may be overridden in local_settings. +# These ensure the application works even if local_settings doesn't define them. +USE_DEV_FILE_CACHE = False +DEV_FILE_CACHE_DIR = "/tmp/sefaria_dev_cache" + # Grab environment specific settings from a file which # is left out of the repo. 
try: diff --git a/sefaria/system/cache.py b/sefaria/system/cache.py index 23450df930..826e3a1a29 100644 --- a/sefaria/system/cache.py +++ b/sefaria/system/cache.py @@ -277,4 +277,125 @@ def invalidate_cache_by_pattern(pattern: str, cache_type: Optional[str] = None) "backend": "unknown", "count": 0, "message": "Internal server error" - } \ No newline at end of file + } + + +# ============================================================================= +# File-based cache for development - persists expensive objects across reloads +# ============================================================================= + +import pickle +from pathlib import Path + + +def _get_dev_file_cache_dir() -> Path: + """Get the development file cache directory from settings.""" + cache_dir = Path(getattr(settings, 'DEV_FILE_CACHE_DIR', '/tmp/sefaria_dev_cache')) + cache_dir.mkdir(parents=True, exist_ok=True) + return cache_dir + + +def _get_dev_cache_path(name: str) -> Path: + """Get the full path for a named cache file.""" + return _get_dev_file_cache_dir() / f"{name}.pkl" + + +def is_dev_file_cache_enabled() -> bool: + """Check if development file caching is enabled.""" + return getattr(settings, 'USE_DEV_FILE_CACHE', False) + + +def save_to_dev_file_cache(name: str, obj: Any) -> bool: + """ + Save an object to the development file cache. + + Args: + name: A unique identifier for the cached object + obj: The object to cache (must be picklable) + + Returns: + True if saved successfully, False otherwise + """ + if not is_dev_file_cache_enabled(): + return False + + try: + cache_path = _get_dev_cache_path(name) + with open(cache_path, 'wb') as f: + pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) + logger.info(f"Saved {name} to dev file cache: {cache_path}") + return True + except Exception as e: + logger.warning(f"Failed to save {name} to dev file cache: {e}") + return False + + +def load_from_dev_file_cache(name: str) -> Optional[Any]: + """ + Load an object from the development file cache. + + Args: + name: The unique identifier for the cached object + + Returns: + The cached object if found and valid, None otherwise + """ + if not is_dev_file_cache_enabled(): + return None + + try: + cache_path = _get_dev_cache_path(name) + if not cache_path.exists(): + logger.info(f"Dev file cache miss for {name}: file not found") + return None + + with open(cache_path, 'rb') as f: + obj = pickle.load(f) + logger.info(f"Loaded {name} from dev file cache: {cache_path}") + return obj + except Exception as e: + logger.warning(f"Failed to load {name} from dev file cache: {e}") + return None + + +def clear_dev_file_cache(name: Optional[str] = None) -> int: + """ + Clear the development file cache. + + Args: + name: If provided, only clear this specific cache entry. + If None, clear all cache entries. + + Returns: + Number of cache files deleted + """ + cache_dir = _get_dev_file_cache_dir() + deleted = 0 + + if name: + cache_path = _get_dev_cache_path(name) + if cache_path.exists(): + cache_path.unlink() + deleted = 1 + logger.info(f"Cleared dev file cache entry: {name}") + else: + for cache_file in cache_dir.glob("*.pkl"): + try: + cache_file.unlink() + deleted += 1 + except Exception as e: + logger.warning(f"Failed to delete cache file {cache_file}: {e}") + logger.info(f"Cleared {deleted} dev file cache entries") + + return deleted + + +def list_dev_file_cache() -> list: + """ + List all entries in the development file cache. 
+ + Returns: + List of cache entry names (without .pkl extension) + """ + cache_dir = _get_dev_file_cache_dir() + return [f.stem for f in cache_dir.glob("*.pkl")] \ No newline at end of file
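
The round trip through the new cache helpers, as a minimal sketch run from a Django shell with USE_DEV_FILE_CACHE = True (and a writable DEV_FILE_CACHE_DIR) in local_settings.py; the entry name "example_entry" below is illustrative, not one the library itself uses:

    from sefaria.system.cache import (
        save_to_dev_file_cache, load_from_dev_file_cache,
        clear_dev_file_cache, list_dev_file_cache,
    )

    # Any picklable object can be cached; the library code above caches its
    # AutoCompleter, Linker, and topic-pool objects under fixed names such as
    # "full_auto_completer", "lexicon_auto_completers", and "linker_he".
    save_to_dev_file_cache("example_entry", {"greeting": "shalom"})  # writes <DEV_FILE_CACHE_DIR>/example_entry.pkl
    assert load_from_dev_file_cache("example_entry") == {"greeting": "shalom"}
    print(list_dev_file_cache())           # e.g. ['example_entry', 'full_auto_completer', ...]
    clear_dev_file_cache("example_entry")  # returns 1; clear_dev_file_cache() with no name clears every entry

Because save_to_dev_file_cache and load_from_dev_file_cache catch exceptions and only log warnings, an unpicklable object (for example, a linker whose spaCy model does not pickle cleanly) falls back to the normal build-from-scratch path rather than raising.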