diff --git a/scripts/scheduled/reindex_elasticsearch_cronjob.py b/scripts/scheduled/reindex_elasticsearch_cronjob.py index 4f8d1d420e..9f5a4aacb2 100644 --- a/scripts/scheduled/reindex_elasticsearch_cronjob.py +++ b/scripts/scheduled/reindex_elasticsearch_cronjob.py @@ -30,6 +30,9 @@ import django django.setup() +from django.contrib.auth.models import User +from django.conf import settings + from sefaria.model import * from sefaria.search import index_all, get_new_and_current_index_names, index_client, es_client, TextIndexer, setup_logging from sefaria.local_settings import SEFARIA_BOT_API_KEY @@ -149,29 +152,23 @@ def detailed_failure_report(self) -> str: if self.failed_text_versions: lines.append(f"\nFailed Text Versions: {len(self.failed_text_versions)}") lines.append("-" * 40) - for i, failure in enumerate(self.failed_text_versions[:50], 1): + for i, failure in enumerate(self.failed_text_versions, 1): title = failure.get('title', 'Unknown') version = failure.get('version', 'Unknown') lang = failure.get('lang', 'Unknown') error_type = failure.get('error_type', 'Unknown') - error = failure.get('error', 'Unknown error')[:100] + error = failure.get('error', 'Unknown error') lines.append(f"{i}. {title} ({version}, {lang})") lines.append(f" Error: {error_type}: {error}") - - if len(self.failed_text_versions) > 50: - lines.append(f"... and {len(self.failed_text_versions) - 50} more") if self.skipped_text_versions: lines.append(f"\nSkipped Text Versions: {len(self.skipped_text_versions)}") lines.append("-" * 40) - for i, skip in enumerate(self.skipped_text_versions[:20], 1): + for i, skip in enumerate(self.skipped_text_versions, 1): title = skip.get('title', 'Unknown') version = skip.get('version', 'Unknown') reason = skip.get('reason', 'Unknown') lines.append(f"{i}. {title} ({version}) - {reason}") - - if len(self.skipped_text_versions) > 20: - lines.append(f"... and {len(self.skipped_text_versions) - 20} more") lines.append(SEPARATOR_LINE) return "\n".join(lines) @@ -190,11 +187,69 @@ def check_elasticsearch_connection() -> bool: return False +def check_django_database_connection() -> bool: + """ + Verify Django database (PostgreSQL) is reachable and properly configured. + + This is critical because sheet indexing requires Django User lookups. + If the database is unreachable, UserProfile falls back to "User {id}" names, + which corrupts the search index with useless data. + + Returns: + True if database is properly configured and reachable, False otherwise. 
+ """ + # First, check if database credentials are configured + db_config = settings.DATABASES.get('default', {}) + db_user = db_config.get('USER') + db_password = db_config.get('PASSWORD') + db_host = db_config.get('HOST') + db_port = db_config.get('PORT') + db_name = db_config.get('NAME') + + # Log configuration (without password) + logger.debug(f"Django database config - host: {db_host}, port: {db_port}, " + f"name: {db_name}, user: {db_user}, password_set: {bool(db_password)}") + + # Check for missing credentials (os.getenv returns None when env var is missing) + missing_fields = [] + if not db_user: + missing_fields.append("USER (DATABASES_USER env var)") + if not db_password: + missing_fields.append("PASSWORD (DATABASES_PASSWORD env var)") + if not db_host: + missing_fields.append("HOST (DATABASES_HOST env var)") + if not db_port: + missing_fields.append("PORT (DATABASES_PORT env var)") + + if missing_fields: + logger.error(f"Django database configuration incomplete - missing: {', '.join(missing_fields)}") + logger.error("This typically means the 'local-settings-secret' Kubernetes secret is missing or misconfigured.") + logger.error("Sheet indexing will produce 'User {id}' names instead of real names!") + return False + + # Now try to actually connect and query + try: + # Simple query to verify connection works + user_count = User.objects.count() + logger.debug(f"Django database connection verified - user_count: {user_count}") + + # Extra sanity check: verify we can get a real user name + first_user = User.objects.first() + if first_user: + logger.debug(f"Sample user lookup successful - id: {first_user.id}, " + f"name: {first_user.first_name} {first_user.last_name}") + + return True + except Exception as e: + logger.error(f"Django database connection failed - error: {str(e)}", exc_info=True) + logger.error(f"Database config - host: {db_host}, port: {db_port}, name: {db_name}, user: {db_user}") + return False + + def check_index_exists(index_name: str) -> bool: """Check if an Elasticsearch index exists.""" try: exists = index_client.exists(index=index_name) - logger.debug(f"Index existence check - index: {index_name}, exists: {exists}") return exists except Exception as e: logger.warning(f"Failed to check index existence - index: {index_name}, error: {str(e)}") @@ -234,10 +289,6 @@ def log_index_state(index_type: str, result: ReindexingResult): new_exists = check_index_exists(new_index) new_count = get_index_doc_count(new_index) if new_exists else 0 - logger.debug(f"Index state for {index_type} - alias: {alias}, current_index: {current_index}, " - f"current_doc_count: {current_count}, new_index: {new_index}, " - f"new_index_exists: {new_exists}, new_doc_count: {new_count}") - # Warn if new index already exists with documents if new_exists and new_count > 0: result.add_warning( @@ -251,7 +302,6 @@ def log_index_state(index_type: str, result: ReindexingResult): def run_pagesheetrank_update(result: ReindexingResult) -> bool: """Run pagesheetrank update with error handling.""" - logger.debug("Starting pagesheetrank update") try: update_pagesheetrank() result.record_step_success("pagesheetrank_update", "PageSheetRank values updated successfully") @@ -263,7 +313,6 @@ def run_pagesheetrank_update(result: ReindexingResult) -> bool: def run_index_all(result: ReindexingResult) -> bool: """Run full index with error handling and failure capture.""" - logger.debug("Starting full index rebuild") try: index_all() @@ -305,10 +354,6 @@ def should_retry_with_backoff(attempt: int, max_retries: int, 
context: str = "") return False wait_time = attempt * 30 # Linear backoff (30s, 60s, 90s) - log_msg = f"Retrying in {wait_time} seconds..." - if context: - log_msg = f"{context} - {log_msg}" - logger.debug(log_msg) time.sleep(wait_time) return True @@ -319,14 +364,10 @@ def run_sheets_by_timestamp(timestamp: str, result: ReindexingResult, max_retrie This catches sheets created/modified during the reindexing process. """ - logger.debug(f"Starting sheets-by-timestamp API call - timestamp: {timestamp}") - url = "https://www.sefaria.org/admin/index-sheets-by-timestamp" for attempt in range(1, max_retries + 1): try: - logger.debug(f"API attempt {attempt}/{max_retries} - url: {url}") - r = requests.post( url, data={"timestamp": timestamp, "apikey": SEFARIA_BOT_API_KEY}, @@ -400,7 +441,6 @@ def main(): # Store timestamp before we start (sheets created after this will be caught by API) last_sheet_timestamp = datetime.now().isoformat() - logger.debug(f"Captured start timestamp for sheet catch-up - timestamp: {last_sheet_timestamp}") # Pre-flight checks logger.debug("Running pre-flight checks...") @@ -411,8 +451,18 @@ def main(): logger.info(result.get_summary()) sys.exit(1) result.record_step_success("preflight_elasticsearch", "Elasticsearch connection verified") - - # 2. Log current index states + + # 2. Check Django database connection (critical for sheet indexing) + if not check_django_database_connection(): + result.record_step_failure("preflight_django_database", + "Cannot connect to Django database. Sheet indexing would produce 'User {id}' names. " + "Check that 'local-settings-secret' Kubernetes secret is properly configured with " + "DATABASES_USER, DATABASES_PASSWORD, DATABASES_HOST, DATABASES_PORT environment variables.") + logger.info(result.get_summary()) + sys.exit(1) + result.record_step_success("preflight_django_database", "Django database connection verified") + + # 3. 
Log current index states try: log_index_state('text', result) log_index_state('sheet', result) @@ -470,7 +520,6 @@ def main(): # Only log final index states if there were failures if result.failed_text_versions or result.skipped_text_versions or result.steps_failed: - logger.debug("Final index states:") try: log_index_state('text', result) log_index_state('sheet', result) diff --git a/sefaria/search.py b/sefaria/search.py index 43bb6074df..bea618cd7f 100644 --- a/sefaria/search.py +++ b/sefaria/search.py @@ -333,13 +333,9 @@ def create_index(index_name, type, force=False): raise if type == 'text': - logger.debug(f"Applying text mapping to index - index_name: {index_name}") put_text_mapping(index_name) - logger.debug(f"Text mapping applied successfully - index_name: {index_name}") elif type == 'sheet': - logger.debug(f"Applying sheet mapping to index - index_name: {index_name}") put_sheet_mapping(index_name) - logger.debug(f"Sheet mapping applied successfully - index_name: {index_name}") else: logger.warning(f"Unknown type, no mapping applied - type: {type}, index_name: {index_name}") @@ -558,7 +554,6 @@ def traverse(mini_toc): r = Ref(title) except InputError: parse_errors.append(title) - logger.debug(f"Failed to parse ref - title: {title}") return vlist = cls.get_ref_version_list(r) vpriorities = defaultdict(lambda: 0) @@ -602,9 +597,6 @@ def get_all_versions(cls, tries=0, versions=None, page=0): versions += temp_versions page += 1 first_run = False - # Log progress every 100 pages - if page % PROGRESS_LOG_EVERY_N == 0: - logger.debug(f"Fetching versions - page: {page}, total_so_far: {len(versions)}") logger.debug(f"Completed fetching all versions - total: {len(versions)}") return versions except pymongo.errors.AutoReconnect as e: @@ -900,11 +892,9 @@ def index_sheets_by_timestamp(timestamp): name_dict = get_new_and_current_index_names('sheet', debug=False) curr_index_name = name_dict.get('current') - logger.debug(f"Using sheet index - index_name: {curr_index_name}") try: ids = db.sheets.find({"status": "public", "dateModified": {"$gt": timestamp}}).distinct("id") - logger.debug(f"Found sheets to index by timestamp - count: {len(ids)}, timestamp: {timestamp}") except Exception as e: logger.error(f"Error querying sheets by timestamp - timestamp: {timestamp}, error: {str(e)}", exc_info=True) return str(e) @@ -939,7 +929,6 @@ def index_public_sheets(index_name): ids = db.sheets.find({"status": "public"}).distinct("id") total = len(ids) - logger.debug(f"Found public sheets to index - total: {total}, first_10_ids: {ids[:10] if ids else []}") succeeded = 0 failed = 0 @@ -974,17 +963,13 @@ def clear_index(index_name): """ Delete the search index. 
""" - logger.debug(f"Attempting to delete Elasticsearch index - index_name: {index_name}") try: # Check if index exists before trying to delete if index_client.exists(index=index_name): index_client.delete(index=index_name) - logger.debug(f"Successfully deleted Elasticsearch index - index_name: {index_name}") - else: - logger.debug(f"Index does not exist, nothing to delete - index_name: {index_name}") except NotFoundError: # Index doesn't exist - handle race condition where index is deleted between exists() check and delete() call - logger.debug(f"Index not found when attempting to delete - index_name: {index_name}") + pass except Exception as e: logger.error(f"Error deleting Elasticsearch index - index_name: {index_name} - error: {str(e)}") @@ -1099,9 +1084,7 @@ def index_all(skip=0, debug=False): raise # Clear index queue - logger.debug("Clearing stale index queue") deleted = db.index_queue.delete_many({}) - logger.debug(f"Cleared index queue - deleted_count: {deleted.deleted_count}") end = datetime.now() total_elapsed = end - start @@ -1122,20 +1105,10 @@ def index_all_of_type(type, skip=0, debug=False): # Check if new index already exists new_exists = index_client.exists(index=index_names_dict.get('new')) - if new_exists: - try: - stats = index_client.stats(index=index_names_dict.get('new')) - doc_count = stats.get('_all', {}).get('primaries', {}).get('docs', {}).get('count', 0) - logger.debug(f"New index already exists, will be recreated - index: {index_names_dict.get('new')}, existing_doc_count: {doc_count}") - except Exception: - logger.debug(f"New index already exists, will be recreated - index: {index_names_dict.get('new')}") # Countdown (keeping for backwards compatibility, but logging instead of just printing) - logger.debug("Starting countdown before indexing...") for i in range(10): remaining = 10 - i - logger.debug(f'STARTING IN T-MINUS {remaining}') - logger.debug(f"Countdown - seconds_remaining: {remaining}") pytime.sleep(1) # Perform the actual indexing @@ -1143,26 +1116,21 @@ def index_all_of_type(type, skip=0, debug=False): index_all_of_type_by_index_name(type, index_names_dict.get('new'), skip, debug) # Switch aliases - logger.debug("Switching aliases after indexing") try: index_client.delete_alias(index=index_names_dict.get('current'), name=index_names_dict.get('alias')) - logger.debug(f"Successfully deleted alias from old index - alias: {index_names_dict.get('alias')}, old_index: {index_names_dict.get('current')}") except NotFoundError: - logger.debug(f"Alias not found on old index (may be first run) - alias: {index_names_dict.get('alias')}, old_index: {index_names_dict.get('current')}") + # Alias not found on old index (may be first run) + pass # Clear any index with the alias name clear_index(index_names_dict.get('alias')) # Create new alias index_client.put_alias(index=index_names_dict.get('new'), name=index_names_dict.get('alias')) - logger.debug(f"Successfully created alias for new index - alias: {index_names_dict.get('alias')}, new_index: {index_names_dict.get('new')}") # Cleanup old index if index_names_dict.get('new') != index_names_dict.get('current'): - logger.debug(f"Cleaning up old index - old_index: {index_names_dict.get('current')}") clear_index(index_names_dict.get('current')) - - logger.debug(f"Completed index_all_of_type for '{type}' - type: {type}, final_index: {index_names_dict.get('new')}, alias: {index_names_dict.get('alias')}") def index_all_of_type_by_index_name(type, index_name, skip=0, debug=False, force_recreate=True): @@ -1179,16 +1147,8 @@ 
def index_all_of_type_by_index_name(type, index_name, skip=0, debug=False, force # Check if index exists and validate index_exists = index_client.exists(index=index_name) - if index_exists and skip == 0: - try: - stats = index_client.stats(index=index_name) - doc_count = stats.get('_all', {}).get('primaries', {}).get('docs', {}).get('count', 0) - logger.debug(f"Index already exists before creation - index_name: {index_name}, existing_doc_count: {doc_count}, will_recreate: {force_recreate}") - except Exception as e: - logger.debug(f"Could not check existing index stats - index_name: {index_name}, error: {str(e)}") if skip == 0: - logger.debug(f"Creating fresh index (skip=0) - index_name: {index_name}, type: {type}") create_index(index_name, type, force=force_recreate) else: logger.info(f"Skipping index creation (resuming) - index_name: {index_name}, skip: {skip}") @@ -1203,9 +1163,7 @@ def index_all_of_type_by_index_name(type, index_name, skip=0, debug=False, force TextIndexer.index_all(index_name, debug=debug) logger.debug("Completed TextIndexer.index_all") elif type == 'sheet': - logger.debug("Starting sheet indexing") index_public_sheets(index_name) - logger.debug("Completed sheet indexing") else: logger.error(f"Unknown index type - type: {type}") raise ValueError(f"Unknown index type: {type}") \ No newline at end of file
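
A minimal way to exercise the two pre-flight checks by hand before relying on the CronJob. This is a sketch, not part of the diff above: it assumes the script module is importable without triggering main() and that the shell has the same local_settings / DATABASES_* environment the CronJob pod receives from the local-settings-secret. The script itself calls django.setup() at import time, as the top of this diff shows, so no extra Django bootstrapping is needed here.

from scripts.scheduled.reindex_elasticsearch_cronjob import (
    check_elasticsearch_connection,
    check_django_database_connection,
)

# Both checks return True in a correctly configured pod. If the
# local-settings-secret is missing a DATABASES_* value, the database check
# returns False and logs which field is absent, instead of letting index_all()
# run and fill the sheet index with "User {id}" author names.
print("elasticsearch reachable:", check_elasticsearch_connection())
print("django database reachable:", check_django_database_connection())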