From a9864107038f2c102b56ec3e648e9c80aab90ac2 Mon Sep 17 00:00:00 2001 From: yonadavGit <92536571+yonadavGit@users.noreply.github.com> Date: Thu, 15 Jan 2026 11:47:08 +0200 Subject: [PATCH 1/2] fix(logging): add diagnostic logging for slugless topics across multiple components --- scripts/apply_tag_topic_mapping.py | 14 ++++++ sefaria/model/topic.py | 5 ++ sefaria/sheets.py | 77 +++++++++++++++++++++++++++++- sourcesheets/views.py | 23 +++++++++ static/js/AboutSheet.jsx | 17 +++++++ static/js/SheetMetadata.jsx | 17 +++++++ static/js/categorize_sheets.jsx | 17 +++++++ static/js/sefaria/sefaria.js | 24 ++++++++++ 8 files changed, 193 insertions(+), 1 deletion(-) diff --git a/scripts/apply_tag_topic_mapping.py b/scripts/apply_tag_topic_mapping.py index 7652b58c5d..67beb70e8d 100644 --- a/scripts/apply_tag_topic_mapping.py +++ b/scripts/apply_tag_topic_mapping.py @@ -44,7 +44,21 @@ if sheet.get('id', None) is None: print("Sheet id is None") continue + + # DIAGNOSTIC LOGGING: Check for slugless topics before bulk write + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(topics): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] apply_tag_topic_mapping.py: Sheet {sheet['id']} has topic without slug at index {idx}. Topic data: {topic}") + print(f"[SLUGLESS_TOPIC_TRACKER] apply_tag_topic_mapping.py: Sheet {sheet['id']}, topic index {idx}, data: {topic}") + updates += [{'id': sheet['id'], 'topics': topics}] + +# DIAGNOSTIC LOGGING: Log bulk write operation +logger.warning(f"[SLUGLESS_TOPIC_TRACKER] apply_tag_topic_mapping.py: About to bulk write {len(updates)} sheet topic updates") +print(f"[SLUGLESS_TOPIC_TRACKER] Bulk writing {len(updates)} updates") + db.sheets.bulk_write([ UpdateOne({"id": l['id']}, {"$set": {"topics": l['topics']}}) for l in updates ]) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index aa472cd1e0..cd936c300b 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -430,6 +430,11 @@ def merge(self, other: Union['Topic', str]) -> None: logger.warning('While merging {} into {}, link assertion failed with message "{}"'.format(other_slug, self.slug, str(e))) # source sheets + # DIAGNOSTIC LOGGING: Track when topics are bulk-updated during merge + import logging + logger = logging.getLogger(__name__) + logger.warning(f"[SLUGLESS_TOPIC_TRACKER] Topic.merge(): Merging slug '{other_slug}' into '{self.slug}' via bulk update on sheets. This should preserve slugs but logging for safety.") + print(f"[SLUGLESS_TOPIC_TRACKER] Topic.merge(): {other_slug} -> {self.slug}") db.sheets.update_many({'topics.slug': other_slug}, {"$set": {'topics.$[element].slug': self.slug}}, array_filters=[{"element.slug": other_slug}]) # indexes diff --git a/sefaria/sheets.py b/sefaria/sheets.py index 5549054fc2..84431ea16f 100755 --- a/sefaria/sheets.py +++ b/sefaria/sheets.py @@ -57,6 +57,16 @@ def get_sheet(id=None): s = db.sheets.find_one({"id": int(id)}) if not s: return {"error": "Couldn't find sheet with id: %s" % (id)} + + # DIAGNOSTIC LOGGING: Check for slugless topics when loading from DB + import logging + logger = logging.getLogger(__name__) + raw_topics = s.get("topics", []) + for idx, topic in enumerate(raw_topics): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] get_sheet() DB Load: Sheet {id} loaded from DB with topic without slug at index {idx}. Topic data: {topic}. 
This means slugless topic exists in MongoDB!") + print(f"[SLUGLESS_TOPIC_TRACKER] get_sheet() DB Load: Sheet {id}, topic index {idx}, data: {topic}") + s["topics"] = add_langs_to_topics(s.get("topics", [])) s["_id"] = str(s["_id"]) collections = CollectionSet({"sheets": id, "listed": True}) @@ -331,7 +341,17 @@ def sheet_topics_counts(query, sort_by="count"): {"$group": {"_id": "$topics.slug", "count": {"$sum": 1}, "asTyped": {"$first": "$topics.asTyped"}}}, {"$sort": sort_query}, {"$project": {"_id": 0, "slug": "$_id", "count": "$count", "asTyped": "$asTyped"}}], cursor={}) - return add_langs_to_topics(list(topics)) + + # DIAGNOSTIC LOGGING: Check if aggregation returned topics without slugs + import logging + logger = logging.getLogger(__name__) + topics_list = list(topics) + for idx, topic in enumerate(topics_list): + if not topic.get('slug') or topic.get('slug') is None: + logger.error(f"[SLUGLESS_TOPIC_TRACKER] sheet_topics_counts(): Aggregation returned topic without slug at index {idx}. Topic data: {topic}. Query: {query}. This means DB has sheets with topics.slug = null/missing") + print(f"[SLUGLESS_TOPIC_TRACKER] sheet_topics_counts(): Aggregation returned slugless topic: {topic}") + + return add_langs_to_topics(topics_list) def order_tags_for_user(tag_counts, uid): @@ -375,6 +395,15 @@ def trending_topics(days=7, ntags=14): {"$project": {"_id": 0, "slug": "$_id", "sheet_count": "$sheet_count", "authors": "$authors"}}], cursor={}) topics_list = list(topics) + + # DIAGNOSTIC LOGGING: Check if aggregation returned topics without slugs + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(topics_list): + if not topic.get('slug') or topic.get('slug') is None: + logger.error(f"[SLUGLESS_TOPIC_TRACKER] trending_topics(): Aggregation returned topic without slug at index {idx}. Topic data: {topic}. This means DB has sheets with topics.slug = null/missing") + print(f"[SLUGLESS_TOPIC_TRACKER] trending_topics(): Aggregation returned slugless topic: {topic}") + results = add_langs_to_topics([{ "slug": topic['slug'], "count": topic['sheet_count'], @@ -457,6 +486,15 @@ def save_sheet(sheet, user_id, search_override=False, rebuild_nodes=False): """ Saves sheet to the db, with user_id as owner. """ + # DIAGNOSTIC LOGGING: Track slugless topics entering save_sheet + import logging + logger = logging.getLogger(__name__) + if "topics" in sheet: + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] save_sheet() Entry: Sheet {sheet.get('id', 'NEW')} has topic without slug at index {idx}. Topic data: {topic}. User: {user_id}, rebuild_nodes: {rebuild_nodes}") + print(f"[SLUGLESS_TOPIC_TRACKER] save_sheet() Entry: Sheet {sheet.get('id', 'NEW')}, topic index {idx}, data: {topic}") + def next_sheet_id(): last_id = db.sheets.find().sort([['id', -1]]).limit(1) if len(list(last_id.clone())): @@ -573,6 +611,13 @@ def next_sheet_id(): sheet = rebuild_sheet_nodes(sheet) if new_sheet: + # DIAGNOSTIC LOGGING: Before inserting new sheet + if "topics" in sheet: + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] Pre-INSERT: NEW sheet has topic without slug at index {idx}. Topic data: {topic}. 
User: {user_id}") + print(f"[SLUGLESS_TOPIC_TRACKER] Pre-INSERT: NEW sheet, topic index {idx}, data: {topic}") + # mongo enforces a unique sheet id, get a new id until a unique one has been found while True: try: @@ -583,6 +628,13 @@ def next_sheet_id(): pass else: + # DIAGNOSTIC LOGGING: Before replacing existing sheet + if "topics" in sheet: + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] Pre-REPLACE: Sheet {sheet['id']} has topic without slug at index {idx}. Topic data: {topic}. User: {user_id}") + print(f"[SLUGLESS_TOPIC_TRACKER] Pre-REPLACE: Sheet {sheet['id']}, topic index {idx}, data: {topic}") + db.sheets.find_one_and_replace({"id": sheet["id"]}, sheet) if len(topics_diff["added"]) or len(topics_diff["removed"]): @@ -901,18 +953,32 @@ def update_sheet_topics(sheet_id, topics, old_topics): containing fields `asTyped` and `slug`. Performs some normalization of `asTyped` and creates new topic objects for new topics. """ + # DIAGNOSTIC LOGGING: Track topics entering update_sheet_topics + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(topics): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics() Entry: Sheet {sheet_id} has topic without slug at index {idx}. Topic data: {topic}. Will attempt to create slug.") + print(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics() Entry: Sheet {sheet_id}, topic index {idx}, data: {topic}") + normalized_slug_title_pairs = set() for topic in topics: # Dedupe, normalize titles, create/choose topics for any missing slugs title = normalize_new_topic_title(topic["asTyped"]) if "slug" not in topic: + logger.warning(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics(): Creating slug for topic. Sheet {sheet_id}, asTyped: '{topic['asTyped']}', normalized_title: '{title}'") + print(f"[SLUGLESS_TOPIC_TRACKER] Creating slug for: {topic['asTyped']}") match = choose_existing_topic_for_title(title) if match: topic["slug"] = match.slug + logger.info(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics(): Matched existing topic '{match.slug}' for '{title}'") + print(f"[SLUGLESS_TOPIC_TRACKER] Matched existing: {match.slug}") else: new_topic = create_topic_from_title(title) topic["slug"] = new_topic.slug + logger.info(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics(): Created NEW topic '{new_topic.slug}' for '{title}'") + print(f"[SLUGLESS_TOPIC_TRACKER] Created NEW topic: {new_topic.slug}") normalized_slug_title_pairs.add((title, topic["slug"])) normalized_topics = [{"asTyped": pair[0], "slug": pair[1]} for pair in normalized_slug_title_pairs] @@ -1301,6 +1367,15 @@ def get_sheet_categorization_info(find_without, skip_ids=[]): def update_sheet_tags_categories(body, uid): + # DIAGNOSTIC LOGGING: Track topics coming through categorization endpoint + import logging + logger = logging.getLogger(__name__) + tags = body.get("tags", []) + for idx, tag in enumerate(tags): + if not tag.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_tags_categories(): Sheet {body['sheetId']} has topic without slug at index {idx}. Topic data: {tag}. 
User: {uid}, Full tags: {tags}") + print(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_tags_categories(): Sheet {body['sheetId']}, topic index {idx}, data: {tag}") + update_sheet_topics(body['sheetId'], body["tags"], []) time = datetime.now().isoformat() noTags = time if body.get("noTags", False) else False diff --git a/sourcesheets/views.py b/sourcesheets/views.py index 3117999b38..77e7e2bc3b 100644 --- a/sourcesheets/views.py +++ b/sourcesheets/views.py @@ -418,6 +418,13 @@ def collections_inclusion_api(request, slug, action, sheet_id): collection.sheets.remove(sheet_id) if request.user.id == sheet["owner"] and sheet.get("displayedCollection", None) == collection.slug: sheet["displayedCollection"] = None + # DIAGNOSTIC LOGGING: Track topics when sheet updated via collection removal + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] collections_inclusion_api REMOVE: Sheet {sheet_id} has topic without slug at index {idx}. Topic data: {topic}. User: {request.user.id}") + print(f"[SLUGLESS_TOPIC_TRACKER] collections_inclusion_api REMOVE: Sheet {sheet_id}, topic index {idx}, data: {topic}") db.sheets.find_one_and_replace({"id": sheet["id"]}, sheet) else: return jsonResponse({"error": "Sheet with id {} is not in this collection.".format(sheet_id)}) @@ -428,6 +435,13 @@ def collections_inclusion_api(request, slug, action, sheet_id): # in another collection, set it to highlight this collection. if request.user.id == sheet["owner"] and not sheet.get("displayedCollection", None): sheet["displayedCollection"] = collection.slug + # DIAGNOSTIC LOGGING: Track topics when sheet updated via collection addition + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] collections_inclusion_api ADD: Sheet {sheet_id} has topic without slug at index {idx}. Topic data: {topic}. User: {request.user.id}") + print(f"[SLUGLESS_TOPIC_TRACKER] collections_inclusion_api ADD: Sheet {sheet_id}, topic index {idx}, data: {topic}") db.sheets.find_one_and_replace({"id": sheet["id"]}, sheet) collection.save() @@ -574,6 +588,15 @@ def save_sheet_api(request): return jsonResponse({"error": "No JSON given in post data."}) sheet = json.loads(j) + # DIAGNOSTIC LOGGING: Track slugless topics at API entry point + if "topics" in sheet: + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] API Entry Point: Sheet {sheet.get('id', 'NEW')} has topic without slug at index {idx}. Topic data: {topic}. 
Request user: {request.user.id if request.user.is_authenticated else 'API_KEY'}, Full topics array: {sheet['topics']}") + print(f"[SLUGLESS_TOPIC_TRACKER] API Entry Point: Sheet {sheet.get('id', 'NEW')}, topic index {idx}, data: {topic}") + if apikey: if "id" in sheet: sheet["lastModified"] = get_sheet(sheet["id"])["dateModified"] # Usually lastModified gets set on the frontend, so we need to set it here to match with the previous dateModified so that the check in `save_sheet` returns properly diff --git a/static/js/AboutSheet.jsx b/static/js/AboutSheet.jsx index 0d24d0f2d4..c57570cf25 100644 --- a/static/js/AboutSheet.jsx +++ b/static/js/AboutSheet.jsx @@ -90,6 +90,16 @@ const AboutSheet = ({ masterPanelSheetId, toggleSignUpModal }) => { slug: tag.slug, }) ) + + // DIAGNOSTIC LOGGING: Track what topics are being sent from frontend + console.log('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx updateTopics(): Sending topics to API:', topics); + topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx: Topic at index ${idx} has NO SLUG! Topic:`, topic); + console.error('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx: Original tag from ReactTags:', newTags[idx]); + } + }); + updatedSheet.topics = topics; updatedSheet.lastModified = lastModified; delete updatedSheet._id; @@ -167,6 +177,13 @@ const AboutSheet = ({ masterPanelSheetId, toggleSignUpModal }) => { const onTagAddition = (tag) => { + // DIAGNOSTIC LOGGING: Track what tag object ReactTags creates + console.log('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx onTagAddition(): Tag added by user:', tag); + if (!tag.slug) { + console.error('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx onTagAddition(): NEW TAG WITHOUT SLUG!', tag); + console.error('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx: This is likely a user-created tag via allowNew=true'); + } + const newTags = [].concat(tags, tag); setTags(newTags); updateTopics(newTags); diff --git a/static/js/SheetMetadata.jsx b/static/js/SheetMetadata.jsx index 3928f3d256..e2e9150290 100644 --- a/static/js/SheetMetadata.jsx +++ b/static/js/SheetMetadata.jsx @@ -228,6 +228,16 @@ class SheetMetadata extends Component { slug: tag.slug, }) ) + + // DIAGNOSTIC LOGGING: Track what topics are being sent from frontend + console.log('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx updateTopics(): Sending topics to API:', topics); + topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx: Topic at index ${idx} has NO SLUG! 
Topic:`, topic); + console.error('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx: Original tag from ReactTags:', tags[idx]); + } + }); + updatedSheet.topics = topics; updatedSheet.lastModified = this.state.lastModified; delete updatedSheet._id; @@ -244,6 +254,13 @@ class SheetMetadata extends Component { } onTagAddition(tag) { + // DIAGNOSTIC LOGGING: Track what tag object ReactTags creates + console.log('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx onTagAddition(): Tag added by user:', tag); + if (!tag.slug) { + console.error('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx onTagAddition(): NEW TAG WITHOUT SLUG!', tag); + console.error('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx: This is likely a user-created tag via allowNew=true'); + } + const tags = [].concat(this.state.tags, tag); this.setState({ tags }); this.updateTopics(tags); diff --git a/static/js/categorize_sheets.jsx b/static/js/categorize_sheets.jsx index 5eb6b33e4a..87e0d2f76b 100644 --- a/static/js/categorize_sheets.jsx +++ b/static/js/categorize_sheets.jsx @@ -45,6 +45,13 @@ class SheetCategorizer extends React.Component { } onTagAddition(tag) { + // DIAGNOSTIC LOGGING: Track what tag object ReactTags creates + console.log('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx onTagAddition(): Tag added by user:', tag); + if (!tag.slug) { + console.error('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx onTagAddition(): NEW TAG WITHOUT SLUG!', tag); + console.error('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx: This is likely a user-created tag via allowNew=true'); + } + const tags = [].concat(this.state.tags, tag); this.setState({ tags, previousTags: tags, noTags: false }); } @@ -70,6 +77,16 @@ class SheetCategorizer extends React.Component { asTyped: tag.name, slug: tag.slug, })); + + // DIAGNOSTIC LOGGING: Track what topics are being sent from categorization + console.log('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx saveAndNext(): Sending topics to API:', topics); + topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx: Topic at index ${idx} has NO SLUG! Topic:`, topic); + console.error('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx: Original tag from ReactTags:', this.state.tags[idx]); + } + }); + const currentCategories = this.state.categories; const keys = Object.keys(currentCategories); const categoriesToSend = keys.filter(x => currentCategories[x]) diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 22f6344150..2d0dafa0f1 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -3090,6 +3090,14 @@ _media: {}, } const sheet = this._loadSheetByID[id]; if (sheet) { + // DIAGNOSTIC LOGGING: Check for slugless topics when loading from cache + if (sheet.topics) { + sheet.topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] sefaria.js loadSheetByID (CACHE): Sheet ${id} has topic without slug at index ${idx}. Topic:`, topic); + } + }); + } if (callback) { callback(sheet); } } else if (callback) { const url = "/api/sheets/" + id +"?more_data=1"; @@ -3097,6 +3105,14 @@ _media: {}, if ("error" in data) { console.log(data["error"]) } + // DIAGNOSTIC LOGGING: Check for slugless topics when loading from API + if (data.topics) { + data.topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] sefaria.js loadSheetByID (API): Sheet ${id} has topic without slug at index ${idx}. 
Topic:`, topic); + } + }); + } this._loadSheetByID[id] = data; callback(data); }); @@ -3517,6 +3533,14 @@ Sefaria.unpackDataFromProps = function(props) { Sefaria._indexDetails[panel.bookRef] = panel.indexDetails; } if (panel.sheet) { + // DIAGNOSTIC LOGGING: Check for slugless topics when caching sheet from panel data + if (panel.sheet.topics) { + panel.sheet.topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] sefaria.js _cacheServerData (PANEL): Sheet ${panel.sheet.id} has topic without slug at index ${idx}. Topic:`, topic); + } + }); + } Sefaria.sheets._loadSheetByID[panel.sheet.id] = panel.sheet; } } From cebf2615565fd1f04de48b657ff5f9d06c8b0d45 Mon Sep 17 00:00:00 2001 From: yonadavGit <92536571+yonadavGit@users.noreply.github.com> Date: Thu, 15 Jan 2026 12:28:24 +0200 Subject: [PATCH 2/2] feat(sheets): add script to find and analyze sheets with slugless topics --- scripts/find_slugless_topics.py | 400 ++++++++++++++++++++++++++++++++ 1 file changed, 400 insertions(+) create mode 100755 scripts/find_slugless_topics.py diff --git a/scripts/find_slugless_topics.py b/scripts/find_slugless_topics.py new file mode 100755 index 0000000000..32defe5a84 --- /dev/null +++ b/scripts/find_slugless_topics.py @@ -0,0 +1,400 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Script to find and analyze sheets with slugless topics. + +Usage: + python scripts/find_slugless_topics.py [options] + +Options: + --count-only Only show counts, don't list sheets + --verbose Show detailed information for each sheet + --export-csv Export results to CSV file + --fix Fix all slugless topics (requires confirmation) +""" + +import sys +import os +import csv +from datetime import datetime + +# Add project root to path +p = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, p) + +from sefaria.system.database import db +from sefaria.sheets import update_sheet_topics + + +def find_sheets_with_slugless_topics(): + """ + Find all sheets that have topics without slug fields. + + Returns: + cursor: MongoDB cursor with matching sheets + """ + query = { + "topics": { + "$elemMatch": { + "slug": {"$exists": False} + } + } + } + + projection = { + "id": 1, + "title": 1, + "owner": 1, + "topics": 1, + "dateCreated": 1, + "dateModified": 1, + "status": 1 + } + + return db.sheets.find(query, projection) + + +def count_sheets_with_slugless_topics(): + """ + Count total sheets with slugless topics. + + Returns: + int: Number of affected sheets + """ + query = { + "topics": { + "$elemMatch": { + "slug": {"$exists": False} + } + } + } + + return db.sheets.count_documents(query) + + +def count_total_sheets_with_topics(): + """ + Count total sheets that have any topics. + + Returns: + int: Number of sheets with topics + """ + query = { + "topics": { + "$exists": True, + "$ne": [] + } + } + + return db.sheets.count_documents(query) + + +def get_affected_users(): + """ + Get list of users who have sheets with slugless topics. + + Returns: + list: List of dicts with user_id and sheet_count + """ + pipeline = [ + { + "$match": { + "topics": { + "$elemMatch": { + "slug": {"$exists": False} + } + } + } + }, + { + "$group": { + "_id": "$owner", + "sheet_count": {"$sum": 1}, + "sheet_ids": {"$push": "$id"} + } + }, + { + "$sort": {"sheet_count": -1} + } + ] + + return list(db.sheets.aggregate(pipeline)) + + +def get_slugless_topics_summary(): + """ + Get summary of all slugless topics across sheets. 
+ + Returns: + list: List of dicts with asTyped text and count + """ + pipeline = [ + { + "$match": { + "topics": { + "$elemMatch": { + "slug": {"$exists": False} + } + } + } + }, + { + "$unwind": "$topics" + }, + { + "$match": { + "topics.slug": {"$exists": False} + } + }, + { + "$group": { + "_id": "$topics.asTyped", + "count": {"$sum": 1}, + "sheet_ids": {"$push": "$id"} + } + }, + { + "$sort": {"count": -1} + }, + { + "$project": { + "_id": 0, + "asTyped": "$_id", + "count": 1, + "sheet_ids": 1 + } + } + ] + + return list(db.sheets.aggregate(pipeline)) + + +def print_summary(): + """Print summary statistics of the issue.""" + print("\n" + "="*70) + print("SLUGLESS TOPICS ANALYSIS SUMMARY") + print("="*70) + + # Overall counts + affected_count = count_sheets_with_slugless_topics() + total_with_topics = count_total_sheets_with_topics() + total_sheets = db.sheets.count_documents({}) + + print(f"\nšŸ“Š OVERALL STATISTICS:") + print(f" Total sheets in database: {total_sheets:,}") + print(f" Sheets with topics: {total_with_topics:,}") + print(f" Sheets with slugless topics: {affected_count:,}") + + if total_with_topics > 0: + percentage = (affected_count / total_with_topics) * 100 + print(f" Percentage affected: {percentage:.2f}%") + + # User breakdown + print(f"\nšŸ‘„ AFFECTED USERS:") + affected_users = get_affected_users() + print(f" Total users affected: {len(affected_users)}") + print(f"\n Top 10 users by affected sheets:") + for i, user in enumerate(affected_users[:10], 1): + print(f" {i}. User {user['_id']}: {user['sheet_count']} sheet(s)") + + # Topic breakdown + print(f"\nšŸ·ļø SLUGLESS TOPICS BREAKDOWN:") + slugless_topics = get_slugless_topics_summary() + print(f" Total unique slugless topics: {len(slugless_topics)}") + print(f"\n Top 20 most common slugless topics:") + for i, topic in enumerate(slugless_topics[:20], 1): + print(f" {i}. '{topic['asTyped']}': {topic['count']} occurrence(s)") + + print("\n" + "="*70 + "\n") + + +def print_detailed_list(limit=None): + """Print detailed list of affected sheets.""" + print("\n" + "="*70) + print("DETAILED SHEET LIST") + print("="*70 + "\n") + + sheets = find_sheets_with_slugless_topics() + + count = 0 + for sheet in sheets: + count += 1 + if limit and count > limit: + print(f"\n... 
(showing first {limit} sheets)") + break + + print(f"Sheet ID: {sheet['id']}") + print(f" Title: {sheet.get('title', 'Untitled')[:80]}") + print(f" Owner: {sheet['owner']}") + print(f" Status: {sheet.get('status', 'unknown')}") + print(f" Created: {sheet.get('dateCreated', 'unknown')}") + print(f" Modified: {sheet.get('dateModified', 'unknown')}") + print(f" Slugless topics:") + for topic in sheet.get('topics', []): + if 'slug' not in topic: + print(f" - asTyped: '{topic.get('asTyped', 'N/A')}'") + print() + + +def export_to_csv(filename=None): + """Export affected sheets to CSV file.""" + if not filename: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"slugless_topics_{timestamp}.csv" + + sheets = find_sheets_with_slugless_topics() + + with open(filename, 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['sheet_id', 'title', 'owner', 'status', 'date_created', + 'date_modified', 'slugless_topic_asTyped', 'url'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + writer.writeheader() + + count = 0 + for sheet in sheets: + slugless_topics = [t for t in sheet.get('topics', []) if 'slug' not in t] + + for topic in slugless_topics: + writer.writerow({ + 'sheet_id': sheet['id'], + 'title': sheet.get('title', 'Untitled'), + 'owner': sheet['owner'], + 'status': sheet.get('status', 'unknown'), + 'date_created': sheet.get('dateCreated', ''), + 'date_modified': sheet.get('dateModified', ''), + 'slugless_topic_asTyped': topic.get('asTyped', ''), + 'url': f"https://www.sefaria.org/sheets/{sheet['id']}" + }) + count += 1 + + print(f"\nāœ… Exported {count} slugless topics from affected sheets to: {filename}") + + +def fix_all_slugless_topics(dry_run=True): + """ + Fix all sheets with slugless topics by running them through update_sheet_topics. + + Args: + dry_run (bool): If True, only show what would be fixed without actually fixing + """ + affected_sheets = list(find_sheets_with_slugless_topics()) + total = len(affected_sheets) + + print(f"\n{'DRY RUN: ' if dry_run else ''}Found {total} sheet(s) to fix") + + if dry_run: + print("\nThis is a DRY RUN. No changes will be made.") + print("Sheets that would be fixed:") + for sheet in affected_sheets[:10]: + slugless = [t.get('asTyped', 'N/A') for t in sheet.get('topics', []) if 'slug' not in t] + print(f" Sheet {sheet['id']}: {slugless}") + if total > 10: + print(f" ... and {total - 10} more") + return + + print("\nāš ļø WARNING: This will modify sheets in the database!") + response = input("Are you sure you want to continue? 
(yes/no): ") + + if response.lower() != 'yes': + print("Aborted.") + return + + fixed_count = 0 + error_count = 0 + + print(f"\nFixing {total} sheet(s)...") + + for i, sheet in enumerate(affected_sheets, 1): + try: + old_topics = sheet.get('topics', []) + # update_sheet_topics will create slugs for topics without them + result = update_sheet_topics(sheet['id'], old_topics, []) + + if result.get('status') == 'ok': + fixed_count += 1 + if i % 10 == 0: + print(f" Progress: {i}/{total} ({(i/total)*100:.1f}%)") + else: + error_count += 1 + print(f" āŒ Error fixing sheet {sheet['id']}: {result}") + + except Exception as e: + error_count += 1 + print(f" āŒ Exception fixing sheet {sheet['id']}: {e}") + + print(f"\nāœ… Fixed: {fixed_count} sheet(s)") + if error_count > 0: + print(f"āŒ Errors: {error_count} sheet(s)") + + +def main(): + """Main function to run the script.""" + import argparse + + parser = argparse.ArgumentParser( + description='Find and analyze sheets with slugless topics' + ) + parser.add_argument( + '--count-only', + action='store_true', + help='Only show counts, don\'t list sheets' + ) + parser.add_argument( + '--verbose', + action='store_true', + help='Show detailed information for each sheet' + ) + parser.add_argument( + '--limit', + type=int, + default=None, + help='Limit number of sheets to display in verbose mode' + ) + parser.add_argument( + '--export-csv', + action='store_true', + help='Export results to CSV file' + ) + parser.add_argument( + '--csv-filename', + type=str, + default=None, + help='Custom filename for CSV export' + ) + parser.add_argument( + '--fix', + action='store_true', + help='Fix all slugless topics (creates slugs)' + ) + parser.add_argument( + '--dry-run', + action='store_true', + help='Dry run mode for --fix (show what would be fixed)' + ) + + args = parser.parse_args() + + # Always show summary + print_summary() + + # Handle different modes + if args.export_csv: + export_to_csv(args.csv_filename) + + if args.fix: + fix_all_slugless_topics(dry_run=args.dry_run) + + if args.verbose and not args.count_only: + print_detailed_list(limit=args.limit) + + if not args.export_csv and not args.fix and not args.verbose: + print("šŸ’” TIP: Use --verbose to see detailed sheet list") + print("šŸ’” TIP: Use --export-csv to export to CSV file") + print("šŸ’” TIP: Use --fix --dry-run to see what would be fixed") + print("šŸ’” TIP: Use --fix to actually fix the issues") + + +if __name__ == "__main__": + main() +