Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions django_topics/models/topic.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,27 @@ def get_topic_slugs_by_pool(self, pool: str) -> QuerySet:
return self.filter(pools__name=pool).values_list("slug", flat=True)

def build_slug_to_pools_cache(self, rebuild=False):
from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache

if rebuild or not self.slug_to_pools:
# Try to load from dev file cache first (only if not forcing rebuild)
if not rebuild:
cached = load_from_dev_file_cache("topic_slug_to_pools")
if cached is not None:
self.slug_to_pools = cached
Comment on lines +32 to +34
Copy link

Copilot AI Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After loading from cache, slug_to_pools is a regular dict, but when built from scratch it's a defaultdict(list). This inconsistency could cause issues. Consider either: 1) wrapping the cached dict in defaultdict(list, cached) after loading, or 2) always using a regular dict and avoiding defaultdict behavior.

Copilot uses AI. Check for mistakes.
return

# Build from scratch
new_slug_to_pools = defaultdict(list)
topics = self.model.objects.values_list('slug', 'pools__name')
for slug, pool_name in topics:
if pool_name:
new_slug_to_pools[slug].append(pool_name)
self.slug_to_pools = new_slug_to_pools

# Save to dev file cache for next reload
save_to_dev_file_cache("topic_slug_to_pools", dict(self.slug_to_pools))

class Topic(models.Model):
slug = models.CharField(max_length=255, primary_key=True)
en_title = models.CharField(max_length=255, blank=True, default="")
Expand Down
55 changes: 55 additions & 0 deletions reader/management/commands/clear_dev_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
Management command to clear the development file cache.

Usage:
python manage.py clear_dev_cache # Clear all cache entries
python manage.py clear_dev_cache --list # List all cache entries
python manage.py clear_dev_cache --name full_auto_completer # Clear specific entry
"""
from django.core.management.base import BaseCommand
from sefaria.system.cache import clear_dev_file_cache, list_dev_file_cache, is_dev_file_cache_enabled


class Command(BaseCommand):
help = 'Clear the development file cache used for persisting library objects across server reloads'

def add_arguments(self, parser):
    """Register the command's two optional flags, ``--name`` and ``--list``, on ``parser``."""
    name_flag = {
        'type': str,
        'help': 'Clear a specific cache entry by name (e.g., full_auto_completer, linker_he)',
    }
    list_flag = {
        'action': 'store_true',
        'help': 'List all cache entries instead of clearing',
    }
    # Registration order is kept: --name first, then --list.
    for flag, spec in (('--name', name_flag), ('--list', list_flag)):
        parser.add_argument(flag, **spec)

def handle(self, *args, **options):
if not is_dev_file_cache_enabled():
self.stdout.write(
self.style.WARNING(
'Dev file cache is not enabled. Set USE_DEV_FILE_CACHE = True in local_settings.py'
)
)
Copy link

Copilot AI Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The command will still attempt to list or clear the cache even when USE_DEV_FILE_CACHE is False, which could be confusing. Consider returning early after displaying the warning message to prevent executing the cache operations when the feature is disabled.

Suggested change
)
)
return

Copilot uses AI. Check for mistakes.

if options['list']:
entries = list_dev_file_cache()
if entries:
self.stdout.write('Dev file cache entries:')
for entry in entries:
self.stdout.write(f' - {entry}')
else:
self.stdout.write('No cache entries found.')
return

name = options.get('name')
deleted = clear_dev_file_cache(name)

if name:
if deleted:
self.stdout.write(self.style.SUCCESS(f'Cleared cache entry: {name}'))
else:
self.stdout.write(self.style.WARNING(f'Cache entry not found: {name}'))
else:
self.stdout.write(self.style.SUCCESS(f'Cleared {deleted} cache entries'))
7 changes: 7 additions & 0 deletions sefaria/local_settings_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,13 @@
# Turns on loading of machine learning models to run linker
ENABLE_LINKER = False

# File-based caching for library objects (AutoCompleter, Linker) during development.
# When enabled, these expensive objects are serialized to disk after first build
# and loaded from disk on subsequent server reloads, significantly speeding up
# development iteration. Should NOT be used in production.
Copy link

Copilot AI Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider adding documentation about when developers should clear the cache, such as after database changes, model updates, or when cached objects appear stale. This could be included in the docstring or as a comment in local_settings_example.py to help developers avoid debugging stale cache issues.

Suggested change
# development iteration. Should NOT be used in production.
# development iteration. Should NOT be used in production.
# Cached files are stored under DEV_FILE_CACHE_DIR; if you change the database contents,
# update models/serialization for these objects, or observe stale behavior, clear this
# directory (delete its contents) so that fresh cache files will be rebuilt.

Copilot uses AI. Check for mistakes.
USE_DEV_FILE_CACHE = False
DEV_FILE_CACHE_DIR = "/tmp/sefaria_dev_cache"

# Caching with Cloudflare
CLOUDFLARE_ZONE = ""
CLOUDFLARE_EMAIL = ""
Expand Down
60 changes: 60 additions & 0 deletions sefaria/model/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5290,6 +5290,21 @@ def build_full_auto_completer(self):
for each of the languages in the library.
Sets internal boolean to True upon successful completion to indicate auto completer is ready.
"""
from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache

# Try to load from dev file cache first
cached = load_from_dev_file_cache("full_auto_completer")
if cached is not None:
self._full_auto_completer = cached
# Re-establish cross-references between language autocompleters
for lang in self.langs:
self._full_auto_completer[lang].set_other_lang_ac(
self._full_auto_completer["he" if lang == "en" else "en"]
)
self._full_auto_completer_is_ready = True
return

# Build from scratch
from .autospell import AutoCompleter
self._full_auto_completer = {
lang: AutoCompleter(lang, library, include_people=True, include_topics=True, include_categories=True, include_parasha=False, include_users=True, include_collections=True) for lang in self.langs
Expand All @@ -5299,28 +5314,57 @@ def build_full_auto_completer(self):
self._full_auto_completer[lang].set_other_lang_ac(self._full_auto_completer["he" if lang == "en" else "en"])
self._full_auto_completer_is_ready = True

# Save to dev file cache for next reload
save_to_dev_file_cache("full_auto_completer", self._full_auto_completer)

def build_lexicon_auto_completers(self):
    """
    Populate the per-lexicon autocompleters: one LexiconTrie per lexicon in the
    LexiconSet queried with ``should_autocomplete: True``, keyed by lexicon name.
    The dev file cache is consulted first; on a miss the tries are built and then
    handed to the dev file cache.
    Sets the internal ready flag to True once the autocompleters are assigned.
    """
    from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache

    cache_key = "lexicon_auto_completers"
    tries_by_lexicon = load_from_dev_file_cache(cache_key)
    built_fresh = tries_by_lexicon is None

    if built_fresh:
        # Imported lazily so a cache hit never touches the autospell/lexicon modules.
        from .autospell import LexiconTrie
        from .lexicon import LexiconSet
        autocomplete_lexicons = LexiconSet({'should_autocomplete': True})
        tries_by_lexicon = {lex.name: LexiconTrie(lex.name) for lex in autocomplete_lexicons}

    self._lexicon_auto_completer = tries_by_lexicon
    self._lexicon_auto_completer_is_ready = True

    if built_fresh:
        # Persist only what was just built; a cache hit is not rewritten.
        save_to_dev_file_cache(cache_key, tries_by_lexicon)

def build_cross_lexicon_auto_completer(self):
    """
    Populate the cross lexicon auto completer: a "he" AutoCompleter constructed with
    ``include_titles=False`` and ``include_lexicons=True``.
    The dev file cache is consulted first; on a miss the completer is built and then
    handed to the dev file cache.
    Sets the internal ready flag to True once the completer is assigned.
    """
    from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache

    cache_key = "cross_lexicon_auto_completer"
    completer = load_from_dev_file_cache(cache_key)
    built_fresh = completer is None

    if built_fresh:
        # Imported lazily so a cache hit never touches the autospell module.
        from .autospell import AutoCompleter
        completer = AutoCompleter("he", library, include_titles=False, include_lexicons=True)

    self._cross_lexicon_auto_completer = completer
    self._cross_lexicon_auto_completer_is_ready = True

    if built_fresh:
        # Persist only what was just built; a cache hit is not rewritten.
        save_to_dev_file_cache(cache_key, completer)


def cross_lexicon_auto_completer(self):
"""
Expand Down Expand Up @@ -5682,6 +5726,17 @@ def get_linker(self, lang: str, rebuild=False):
return linker

def build_linker(self, lang: str):
from sefaria.system.cache import load_from_dev_file_cache, save_to_dev_file_cache

cache_key = f"linker_{lang}"

# Try to load from dev file cache first
cached = load_from_dev_file_cache(cache_key)
if cached is not None:
self._linker_by_lang[lang] = cached
return self._linker_by_lang[lang]

# Build from scratch
from sefaria.model.linker.linker import Linker

logger.info("Loading Spacy Model")
Expand All @@ -5691,6 +5746,11 @@ def build_linker(self, lang: str):
named_entity_recognizer = self._build_named_entity_recognizer(lang)
cat_resolver = self._build_category_resolver(lang)
self._linker_by_lang[lang] = Linker(named_entity_recognizer, ref_resolver, named_entity_resolver, cat_resolver)

# Save to dev file cache for next reload
# Note: Spacy models may not pickle cleanly - if this fails, cache save is skipped gracefully
save_to_dev_file_cache(cache_key, self._linker_by_lang[lang])

return self._linker_by_lang[lang]

@staticmethod
Expand Down
5 changes: 5 additions & 0 deletions sefaria/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,11 @@ def get_static_url():
}


# Default values for settings that may be overridden in local_settings.
# These ensure the application works even if local_settings doesn't define them.
USE_DEV_FILE_CACHE = False
DEV_FILE_CACHE_DIR = "/tmp/sefaria_dev_cache"
Copy link

Copilot AI Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using /tmp as the default cache directory may cause issues on systems where /tmp is cleaned on reboot or in containerized environments. Consider using a more persistent location or documenting that developers should configure DEV_FILE_CACHE_DIR in local_settings.py for reliable caching across system restarts.

Suggested change
DEV_FILE_CACHE_DIR = "/tmp/sefaria_dev_cache"
DEV_FILE_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".sefaria_dev_cache")

Copilot uses AI. Check for mistakes.

# Grab environment specific settings from a file which
# is left out of the repo.
try:
Expand Down
123 changes: 122 additions & 1 deletion sefaria/system/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,4 +277,125 @@ def invalidate_cache_by_pattern(pattern: str, cache_type: Optional[str] = None)
"backend": "unknown",
"count": 0,
"message": "Internal server error"
}
}


# =============================================================================
# File-based cache for development - persists expensive objects across reloads
# =============================================================================

import pickle
from pathlib import Path
Comment on lines +287 to +288
Copy link

Copilot AI Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pickle import should be placed at the top of the file with other standard library imports for better code organization and adherence to PEP 8 standards. Currently it's imported in the middle of the file after other code sections.

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Path import should be placed at the top of the file with other imports for better code organization and adherence to PEP 8 standards. Currently it's imported in the middle of the file after other code sections.

Copilot uses AI. Check for mistakes.


def _get_dev_file_cache_dir() -> Path:
"""Get the development file cache directory from settings."""
cache_dir = Path(getattr(settings, 'DEV_FILE_CACHE_DIR', '/tmp/sefaria_dev_cache'))
cache_dir.mkdir(parents=True, exist_ok=True)
Copy link

Copilot AI Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cache directory is created with default permissions using mkdir(parents=True, exist_ok=True). For security, consider explicitly setting restrictive permissions (e.g., mode=0o700) to ensure the cache directory is only accessible by the application user, preventing unauthorized access to pickled objects.

Suggested change
cache_dir.mkdir(parents=True, exist_ok=True)
cache_dir.mkdir(parents=True, exist_ok=True, mode=0o700)
try:
cache_dir.chmod(0o700)
except Exception as e:
# Log a warning but do not fail if we cannot enforce strict permissions
logger.warning(f"Unable to set permissions on dev file cache dir {cache_dir}: {e}")

Copilot uses AI. Check for mistakes.
return cache_dir


def _get_dev_cache_path(name: str) -> Path:
    """Return the path of the pickle file backing cache entry ``name``.

    The file sits directly inside the dev file cache directory and is named
    ``<name>.pkl``.
    """
    cache_dir = _get_dev_file_cache_dir()
    return cache_dir.joinpath(f"{name}.pkl")


def is_dev_file_cache_enabled() -> bool:
    """Return the ``USE_DEV_FILE_CACHE`` setting; a missing setting counts as disabled."""
    enabled = getattr(settings, 'USE_DEV_FILE_CACHE', False)
    return enabled


def save_to_dev_file_cache(name: str, obj: Any) -> bool:
"""
Save an object to the development file cache.

Args:
name: A unique identifier for the cached object
obj: The object to cache (must be picklable)

Returns:
True if saved successfully, False otherwise
"""
if not is_dev_file_cache_enabled():
return False

try:
cache_path = _get_dev_cache_path(name)
with open(cache_path, 'wb') as f:
pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
Comment on lines +322 to +325
Copy link

Copilot AI Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using pickle to deserialize untrusted data is a security risk as it can lead to arbitrary code execution. While this is intended for development use only, consider adding a warning in the documentation or using a safer serialization format like JSON where possible. At minimum, ensure the cache directory has appropriate permissions and is not world-writable.

Copilot uses AI. Check for mistakes.
logger.info(f"Saved {name} to dev file cache: {cache_path}")
return True
except Exception as e:
logger.warning(f"Failed to save {name} to dev file cache: {e}")
return False


def load_from_dev_file_cache(name: str) -> Optional[Any]:
    """
    Fetch a previously pickled object from the development file cache.

    Args:
        name: Identifier of the cache entry to read.

    Returns:
        The unpickled object, or None when the dev file cache is disabled,
        the entry's file does not exist, or reading/unpickling raises.
    """
    if is_dev_file_cache_enabled():
        try:
            pickle_path = _get_dev_cache_path(name)
            if pickle_path.exists():
                with pickle_path.open('rb') as stream:
                    loaded = pickle.load(stream)
                logger.info(f"Loaded {name} from dev file cache: {pickle_path}")
                return loaded
            logger.info(f"Dev file cache miss for {name}: file not found")
        except Exception as err:
            # Best-effort cache: any failure is logged and treated as a miss.
            logger.warning(f"Failed to load {name} from dev file cache: {err}")
    return None


def clear_dev_file_cache(name: Optional[str] = None) -> int:
    """
    Clear the development file cache.

    Args:
        name: If provided, only clear this specific cache entry.
            If None (or an empty string), clear all cache entries.

    Returns:
        Number of cache files deleted
    """
    if name:
        cache_path = _get_dev_cache_path(name)
        try:
            # Delete directly rather than checking exists() first: the file may
            # be removed between the check and the unlink.
            cache_path.unlink()
        except FileNotFoundError:
            return 0
        logger.info(f"Cleared dev file cache entry: {name}")
        return 1

    deleted = 0
    for cache_file in _get_dev_file_cache_dir().glob("*.pkl"):
        try:
            cache_file.unlink()
        except OSError as e:
            # Keep going so one undeletable file does not block the rest.
            logger.warning(f"Failed to delete cache file {cache_file}: {e}")
        else:
            deleted += 1
    logger.info(f"Cleared {deleted} dev file cache entries")
    return deleted


def list_dev_file_cache() -> list:
    """
    Enumerate the entries currently stored in the development file cache.

    Returns:
        The stem (file name minus the .pkl extension) of every ``*.pkl`` file
        found in the cache directory
    """
    entry_names = []
    for pickle_file in _get_dev_file_cache_dir().glob("*.pkl"):
        entry_names.append(pickle_file.stem)
    return entry_names
Loading