From a9864107038f2c102b56ec3e648e9c80aab90ac2 Mon Sep 17 00:00:00 2001 From: yonadavGit <92536571+yonadavGit@users.noreply.github.com> Date: Thu, 15 Jan 2026 11:47:08 +0200 Subject: [PATCH 1/2] fix(logging): add diagnostic logging for slugless topics across multiple components --- scripts/apply_tag_topic_mapping.py | 14 ++++++ sefaria/model/topic.py | 5 ++ sefaria/sheets.py | 77 +++++++++++++++++++++++++++++- sourcesheets/views.py | 23 +++++++++ static/js/AboutSheet.jsx | 17 +++++++ static/js/SheetMetadata.jsx | 17 +++++++ static/js/categorize_sheets.jsx | 17 +++++++ static/js/sefaria/sefaria.js | 24 ++++++++++ 8 files changed, 193 insertions(+), 1 deletion(-) diff --git a/scripts/apply_tag_topic_mapping.py b/scripts/apply_tag_topic_mapping.py index 7652b58c5d..67beb70e8d 100644 --- a/scripts/apply_tag_topic_mapping.py +++ b/scripts/apply_tag_topic_mapping.py @@ -44,7 +44,21 @@ if sheet.get('id', None) is None: print("Sheet id is None") continue + + # DIAGNOSTIC LOGGING: Check for slugless topics before bulk write + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(topics): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] apply_tag_topic_mapping.py: Sheet {sheet['id']} has topic without slug at index {idx}. Topic data: {topic}") + print(f"[SLUGLESS_TOPIC_TRACKER] apply_tag_topic_mapping.py: Sheet {sheet['id']}, topic index {idx}, data: {topic}") + updates += [{'id': sheet['id'], 'topics': topics}] + +# DIAGNOSTIC LOGGING: Log bulk write operation +logger.warning(f"[SLUGLESS_TOPIC_TRACKER] apply_tag_topic_mapping.py: About to bulk write {len(updates)} sheet topic updates") +print(f"[SLUGLESS_TOPIC_TRACKER] Bulk writing {len(updates)} updates") + db.sheets.bulk_write([ UpdateOne({"id": l['id']}, {"$set": {"topics": l['topics']}}) for l in updates ]) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index aa472cd1e0..cd936c300b 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -430,6 +430,11 @@ def merge(self, other: Union['Topic', str]) -> None: logger.warning('While merging {} into {}, link assertion failed with message "{}"'.format(other_slug, self.slug, str(e))) # source sheets + # DIAGNOSTIC LOGGING: Track when topics are bulk-updated during merge + import logging + logger = logging.getLogger(__name__) + logger.warning(f"[SLUGLESS_TOPIC_TRACKER] Topic.merge(): Merging slug '{other_slug}' into '{self.slug}' via bulk update on sheets. This should preserve slugs but logging for safety.") + print(f"[SLUGLESS_TOPIC_TRACKER] Topic.merge(): {other_slug} -> {self.slug}") db.sheets.update_many({'topics.slug': other_slug}, {"$set": {'topics.$[element].slug': self.slug}}, array_filters=[{"element.slug": other_slug}]) # indexes diff --git a/sefaria/sheets.py b/sefaria/sheets.py index 5549054fc2..84431ea16f 100755 --- a/sefaria/sheets.py +++ b/sefaria/sheets.py @@ -57,6 +57,16 @@ def get_sheet(id=None): s = db.sheets.find_one({"id": int(id)}) if not s: return {"error": "Couldn't find sheet with id: %s" % (id)} + + # DIAGNOSTIC LOGGING: Check for slugless topics when loading from DB + import logging + logger = logging.getLogger(__name__) + raw_topics = s.get("topics", []) + for idx, topic in enumerate(raw_topics): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] get_sheet() DB Load: Sheet {id} loaded from DB with topic without slug at index {idx}. Topic data: {topic}. 
This means slugless topic exists in MongoDB!") + print(f"[SLUGLESS_TOPIC_TRACKER] get_sheet() DB Load: Sheet {id}, topic index {idx}, data: {topic}") + s["topics"] = add_langs_to_topics(s.get("topics", [])) s["_id"] = str(s["_id"]) collections = CollectionSet({"sheets": id, "listed": True}) @@ -331,7 +341,17 @@ def sheet_topics_counts(query, sort_by="count"): {"$group": {"_id": "$topics.slug", "count": {"$sum": 1}, "asTyped": {"$first": "$topics.asTyped"}}}, {"$sort": sort_query}, {"$project": {"_id": 0, "slug": "$_id", "count": "$count", "asTyped": "$asTyped"}}], cursor={}) - return add_langs_to_topics(list(topics)) + + # DIAGNOSTIC LOGGING: Check if aggregation returned topics without slugs + import logging + logger = logging.getLogger(__name__) + topics_list = list(topics) + for idx, topic in enumerate(topics_list): + if not topic.get('slug') or topic.get('slug') is None: + logger.error(f"[SLUGLESS_TOPIC_TRACKER] sheet_topics_counts(): Aggregation returned topic without slug at index {idx}. Topic data: {topic}. Query: {query}. This means DB has sheets with topics.slug = null/missing") + print(f"[SLUGLESS_TOPIC_TRACKER] sheet_topics_counts(): Aggregation returned slugless topic: {topic}") + + return add_langs_to_topics(topics_list) def order_tags_for_user(tag_counts, uid): @@ -375,6 +395,15 @@ def trending_topics(days=7, ntags=14): {"$project": {"_id": 0, "slug": "$_id", "sheet_count": "$sheet_count", "authors": "$authors"}}], cursor={}) topics_list = list(topics) + + # DIAGNOSTIC LOGGING: Check if aggregation returned topics without slugs + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(topics_list): + if not topic.get('slug') or topic.get('slug') is None: + logger.error(f"[SLUGLESS_TOPIC_TRACKER] trending_topics(): Aggregation returned topic without slug at index {idx}. Topic data: {topic}. This means DB has sheets with topics.slug = null/missing") + print(f"[SLUGLESS_TOPIC_TRACKER] trending_topics(): Aggregation returned slugless topic: {topic}") + results = add_langs_to_topics([{ "slug": topic['slug'], "count": topic['sheet_count'], @@ -457,6 +486,15 @@ def save_sheet(sheet, user_id, search_override=False, rebuild_nodes=False): """ Saves sheet to the db, with user_id as owner. """ + # DIAGNOSTIC LOGGING: Track slugless topics entering save_sheet + import logging + logger = logging.getLogger(__name__) + if "topics" in sheet: + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] save_sheet() Entry: Sheet {sheet.get('id', 'NEW')} has topic without slug at index {idx}. Topic data: {topic}. User: {user_id}, rebuild_nodes: {rebuild_nodes}") + print(f"[SLUGLESS_TOPIC_TRACKER] save_sheet() Entry: Sheet {sheet.get('id', 'NEW')}, topic index {idx}, data: {topic}") + def next_sheet_id(): last_id = db.sheets.find().sort([['id', -1]]).limit(1) if len(list(last_id.clone())): @@ -573,6 +611,13 @@ def next_sheet_id(): sheet = rebuild_sheet_nodes(sheet) if new_sheet: + # DIAGNOSTIC LOGGING: Before inserting new sheet + if "topics" in sheet: + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] Pre-INSERT: NEW sheet has topic without slug at index {idx}. Topic data: {topic}. 
User: {user_id}") + print(f"[SLUGLESS_TOPIC_TRACKER] Pre-INSERT: NEW sheet, topic index {idx}, data: {topic}") + # mongo enforces a unique sheet id, get a new id until a unique one has been found while True: try: @@ -583,6 +628,13 @@ def next_sheet_id(): pass else: + # DIAGNOSTIC LOGGING: Before replacing existing sheet + if "topics" in sheet: + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] Pre-REPLACE: Sheet {sheet['id']} has topic without slug at index {idx}. Topic data: {topic}. User: {user_id}") + print(f"[SLUGLESS_TOPIC_TRACKER] Pre-REPLACE: Sheet {sheet['id']}, topic index {idx}, data: {topic}") + db.sheets.find_one_and_replace({"id": sheet["id"]}, sheet) if len(topics_diff["added"]) or len(topics_diff["removed"]): @@ -901,18 +953,32 @@ def update_sheet_topics(sheet_id, topics, old_topics): containing fields `asTyped` and `slug`. Performs some normalization of `asTyped` and creates new topic objects for new topics. """ + # DIAGNOSTIC LOGGING: Track topics entering update_sheet_topics + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(topics): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics() Entry: Sheet {sheet_id} has topic without slug at index {idx}. Topic data: {topic}. Will attempt to create slug.") + print(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics() Entry: Sheet {sheet_id}, topic index {idx}, data: {topic}") + normalized_slug_title_pairs = set() for topic in topics: # Dedupe, normalize titles, create/choose topics for any missing slugs title = normalize_new_topic_title(topic["asTyped"]) if "slug" not in topic: + logger.warning(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics(): Creating slug for topic. Sheet {sheet_id}, asTyped: '{topic['asTyped']}', normalized_title: '{title}'") + print(f"[SLUGLESS_TOPIC_TRACKER] Creating slug for: {topic['asTyped']}") match = choose_existing_topic_for_title(title) if match: topic["slug"] = match.slug + logger.info(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics(): Matched existing topic '{match.slug}' for '{title}'") + print(f"[SLUGLESS_TOPIC_TRACKER] Matched existing: {match.slug}") else: new_topic = create_topic_from_title(title) topic["slug"] = new_topic.slug + logger.info(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_topics(): Created NEW topic '{new_topic.slug}' for '{title}'") + print(f"[SLUGLESS_TOPIC_TRACKER] Created NEW topic: {new_topic.slug}") normalized_slug_title_pairs.add((title, topic["slug"])) normalized_topics = [{"asTyped": pair[0], "slug": pair[1]} for pair in normalized_slug_title_pairs] @@ -1301,6 +1367,15 @@ def get_sheet_categorization_info(find_without, skip_ids=[]): def update_sheet_tags_categories(body, uid): + # DIAGNOSTIC LOGGING: Track topics coming through categorization endpoint + import logging + logger = logging.getLogger(__name__) + tags = body.get("tags", []) + for idx, tag in enumerate(tags): + if not tag.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_tags_categories(): Sheet {body['sheetId']} has topic without slug at index {idx}. Topic data: {tag}. 
User: {uid}, Full tags: {tags}") + print(f"[SLUGLESS_TOPIC_TRACKER] update_sheet_tags_categories(): Sheet {body['sheetId']}, topic index {idx}, data: {tag}") + update_sheet_topics(body['sheetId'], body["tags"], []) time = datetime.now().isoformat() noTags = time if body.get("noTags", False) else False diff --git a/sourcesheets/views.py b/sourcesheets/views.py index 3117999b38..77e7e2bc3b 100644 --- a/sourcesheets/views.py +++ b/sourcesheets/views.py @@ -418,6 +418,13 @@ def collections_inclusion_api(request, slug, action, sheet_id): collection.sheets.remove(sheet_id) if request.user.id == sheet["owner"] and sheet.get("displayedCollection", None) == collection.slug: sheet["displayedCollection"] = None + # DIAGNOSTIC LOGGING: Track topics when sheet updated via collection removal + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] collections_inclusion_api REMOVE: Sheet {sheet_id} has topic without slug at index {idx}. Topic data: {topic}. User: {request.user.id}") + print(f"[SLUGLESS_TOPIC_TRACKER] collections_inclusion_api REMOVE: Sheet {sheet_id}, topic index {idx}, data: {topic}") db.sheets.find_one_and_replace({"id": sheet["id"]}, sheet) else: return jsonResponse({"error": "Sheet with id {} is not in this collection.".format(sheet_id)}) @@ -428,6 +435,13 @@ def collections_inclusion_api(request, slug, action, sheet_id): # in another collection, set it to highlight this collection. if request.user.id == sheet["owner"] and not sheet.get("displayedCollection", None): sheet["displayedCollection"] = collection.slug + # DIAGNOSTIC LOGGING: Track topics when sheet updated via collection addition + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] collections_inclusion_api ADD: Sheet {sheet_id} has topic without slug at index {idx}. Topic data: {topic}. User: {request.user.id}") + print(f"[SLUGLESS_TOPIC_TRACKER] collections_inclusion_api ADD: Sheet {sheet_id}, topic index {idx}, data: {topic}") db.sheets.find_one_and_replace({"id": sheet["id"]}, sheet) collection.save() @@ -574,6 +588,15 @@ def save_sheet_api(request): return jsonResponse({"error": "No JSON given in post data."}) sheet = json.loads(j) + # DIAGNOSTIC LOGGING: Track slugless topics at API entry point + if "topics" in sheet: + import logging + logger = logging.getLogger(__name__) + for idx, topic in enumerate(sheet.get("topics", [])): + if not topic.get("slug"): + logger.error(f"[SLUGLESS_TOPIC_TRACKER] API Entry Point: Sheet {sheet.get('id', 'NEW')} has topic without slug at index {idx}. Topic data: {topic}. 
Request user: {request.user.id if request.user.is_authenticated else 'API_KEY'}, Full topics array: {sheet['topics']}") + print(f"[SLUGLESS_TOPIC_TRACKER] API Entry Point: Sheet {sheet.get('id', 'NEW')}, topic index {idx}, data: {topic}") + if apikey: if "id" in sheet: sheet["lastModified"] = get_sheet(sheet["id"])["dateModified"] # Usually lastModified gets set on the frontend, so we need to set it here to match with the previous dateModified so that the check in `save_sheet` returns properly diff --git a/static/js/AboutSheet.jsx b/static/js/AboutSheet.jsx index 0d24d0f2d4..c57570cf25 100644 --- a/static/js/AboutSheet.jsx +++ b/static/js/AboutSheet.jsx @@ -90,6 +90,16 @@ const AboutSheet = ({ masterPanelSheetId, toggleSignUpModal }) => { slug: tag.slug, }) ) + + // DIAGNOSTIC LOGGING: Track what topics are being sent from frontend + console.log('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx updateTopics(): Sending topics to API:', topics); + topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx: Topic at index ${idx} has NO SLUG! Topic:`, topic); + console.error('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx: Original tag from ReactTags:', newTags[idx]); + } + }); + updatedSheet.topics = topics; updatedSheet.lastModified = lastModified; delete updatedSheet._id; @@ -167,6 +177,13 @@ const AboutSheet = ({ masterPanelSheetId, toggleSignUpModal }) => { const onTagAddition = (tag) => { + // DIAGNOSTIC LOGGING: Track what tag object ReactTags creates + console.log('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx onTagAddition(): Tag added by user:', tag); + if (!tag.slug) { + console.error('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx onTagAddition(): NEW TAG WITHOUT SLUG!', tag); + console.error('[SLUGLESS_TOPIC_TRACKER] AboutSheet.jsx: This is likely a user-created tag via allowNew=true'); + } + const newTags = [].concat(tags, tag); setTags(newTags); updateTopics(newTags); diff --git a/static/js/SheetMetadata.jsx b/static/js/SheetMetadata.jsx index 3928f3d256..e2e9150290 100644 --- a/static/js/SheetMetadata.jsx +++ b/static/js/SheetMetadata.jsx @@ -228,6 +228,16 @@ class SheetMetadata extends Component { slug: tag.slug, }) ) + + // DIAGNOSTIC LOGGING: Track what topics are being sent from frontend + console.log('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx updateTopics(): Sending topics to API:', topics); + topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx: Topic at index ${idx} has NO SLUG! 
Topic:`, topic); + console.error('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx: Original tag from ReactTags:', tags[idx]); + } + }); + updatedSheet.topics = topics; updatedSheet.lastModified = this.state.lastModified; delete updatedSheet._id; @@ -244,6 +254,13 @@ class SheetMetadata extends Component { } onTagAddition(tag) { + // DIAGNOSTIC LOGGING: Track what tag object ReactTags creates + console.log('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx onTagAddition(): Tag added by user:', tag); + if (!tag.slug) { + console.error('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx onTagAddition(): NEW TAG WITHOUT SLUG!', tag); + console.error('[SLUGLESS_TOPIC_TRACKER] SheetMetadata.jsx: This is likely a user-created tag via allowNew=true'); + } + const tags = [].concat(this.state.tags, tag); this.setState({ tags }); this.updateTopics(tags); diff --git a/static/js/categorize_sheets.jsx b/static/js/categorize_sheets.jsx index 5eb6b33e4a..87e0d2f76b 100644 --- a/static/js/categorize_sheets.jsx +++ b/static/js/categorize_sheets.jsx @@ -45,6 +45,13 @@ class SheetCategorizer extends React.Component { } onTagAddition(tag) { + // DIAGNOSTIC LOGGING: Track what tag object ReactTags creates + console.log('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx onTagAddition(): Tag added by user:', tag); + if (!tag.slug) { + console.error('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx onTagAddition(): NEW TAG WITHOUT SLUG!', tag); + console.error('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx: This is likely a user-created tag via allowNew=true'); + } + const tags = [].concat(this.state.tags, tag); this.setState({ tags, previousTags: tags, noTags: false }); } @@ -70,6 +77,16 @@ class SheetCategorizer extends React.Component { asTyped: tag.name, slug: tag.slug, })); + + // DIAGNOSTIC LOGGING: Track what topics are being sent from categorization + console.log('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx saveAndNext(): Sending topics to API:', topics); + topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx: Topic at index ${idx} has NO SLUG! Topic:`, topic); + console.error('[SLUGLESS_TOPIC_TRACKER] categorize_sheets.jsx: Original tag from ReactTags:', this.state.tags[idx]); + } + }); + const currentCategories = this.state.categories; const keys = Object.keys(currentCategories); const categoriesToSend = keys.filter(x => currentCategories[x]) diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 22f6344150..2d0dafa0f1 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -3090,6 +3090,14 @@ _media: {}, } const sheet = this._loadSheetByID[id]; if (sheet) { + // DIAGNOSTIC LOGGING: Check for slugless topics when loading from cache + if (sheet.topics) { + sheet.topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] sefaria.js loadSheetByID (CACHE): Sheet ${id} has topic without slug at index ${idx}. Topic:`, topic); + } + }); + } if (callback) { callback(sheet); } } else if (callback) { const url = "/api/sheets/" + id +"?more_data=1"; @@ -3097,6 +3105,14 @@ _media: {}, if ("error" in data) { console.log(data["error"]) } + // DIAGNOSTIC LOGGING: Check for slugless topics when loading from API + if (data.topics) { + data.topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] sefaria.js loadSheetByID (API): Sheet ${id} has topic without slug at index ${idx}. 
Topic:`, topic); + } + }); + } this._loadSheetByID[id] = data; callback(data); }); @@ -3517,6 +3533,14 @@ Sefaria.unpackDataFromProps = function(props) { Sefaria._indexDetails[panel.bookRef] = panel.indexDetails; } if (panel.sheet) { + // DIAGNOSTIC LOGGING: Check for slugless topics when caching sheet from panel data + if (panel.sheet.topics) { + panel.sheet.topics.forEach((topic, idx) => { + if (!topic.slug) { + console.error(`[SLUGLESS_TOPIC_TRACKER] sefaria.js _cacheServerData (PANEL): Sheet ${panel.sheet.id} has topic without slug at index ${idx}. Topic:`, topic); + } + }); + } Sefaria.sheets._loadSheetByID[panel.sheet.id] = panel.sheet; } } From cebf2615565fd1f04de48b657ff5f9d06c8b0d45 Mon Sep 17 00:00:00 2001 From: yonadavGit <92536571+yonadavGit@users.noreply.github.com> Date: Thu, 15 Jan 2026 12:28:24 +0200 Subject: [PATCH 2/2] feat(sheets): add script to find and analyze sheets with slugless topics --- scripts/find_slugless_topics.py | 400 ++++++++++++++++++++++++++++++++ 1 file changed, 400 insertions(+) create mode 100755 scripts/find_slugless_topics.py diff --git a/scripts/find_slugless_topics.py b/scripts/find_slugless_topics.py new file mode 100755 index 0000000000..32defe5a84 --- /dev/null +++ b/scripts/find_slugless_topics.py @@ -0,0 +1,400 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Script to find and analyze sheets with slugless topics. + +Usage: + python scripts/find_slugless_topics.py [options] + +Options: + --count-only Only show counts, don't list sheets + --verbose Show detailed information for each sheet + --export-csv Export results to CSV file + --fix Fix all slugless topics (requires confirmation) +""" + +import sys +import os +import csv +from datetime import datetime + +# Add project root to path +p = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, p) + +from sefaria.system.database import db +from sefaria.sheets import update_sheet_topics + + +def find_sheets_with_slugless_topics(): + """ + Find all sheets that have topics without slug fields. + + Returns: + cursor: MongoDB cursor with matching sheets + """ + query = { + "topics": { + "$elemMatch": { + "slug": {"$exists": False} + } + } + } + + projection = { + "id": 1, + "title": 1, + "owner": 1, + "topics": 1, + "dateCreated": 1, + "dateModified": 1, + "status": 1 + } + + return db.sheets.find(query, projection) + + +def count_sheets_with_slugless_topics(): + """ + Count total sheets with slugless topics. + + Returns: + int: Number of affected sheets + """ + query = { + "topics": { + "$elemMatch": { + "slug": {"$exists": False} + } + } + } + + return db.sheets.count_documents(query) + + +def count_total_sheets_with_topics(): + """ + Count total sheets that have any topics. + + Returns: + int: Number of sheets with topics + """ + query = { + "topics": { + "$exists": True, + "$ne": [] + } + } + + return db.sheets.count_documents(query) + + +def get_affected_users(): + """ + Get list of users who have sheets with slugless topics. + + Returns: + list: List of dicts with user_id and sheet_count + """ + pipeline = [ + { + "$match": { + "topics": { + "$elemMatch": { + "slug": {"$exists": False} + } + } + } + }, + { + "$group": { + "_id": "$owner", + "sheet_count": {"$sum": 1}, + "sheet_ids": {"$push": "$id"} + } + }, + { + "$sort": {"sheet_count": -1} + } + ] + + return list(db.sheets.aggregate(pipeline)) + + +def get_slugless_topics_summary(): + """ + Get summary of all slugless topics across sheets. 
+ + Returns: + list: List of dicts with asTyped text and count + """ + pipeline = [ + { + "$match": { + "topics": { + "$elemMatch": { + "slug": {"$exists": False} + } + } + } + }, + { + "$unwind": "$topics" + }, + { + "$match": { + "topics.slug": {"$exists": False} + } + }, + { + "$group": { + "_id": "$topics.asTyped", + "count": {"$sum": 1}, + "sheet_ids": {"$push": "$id"} + } + }, + { + "$sort": {"count": -1} + }, + { + "$project": { + "_id": 0, + "asTyped": "$_id", + "count": 1, + "sheet_ids": 1 + } + } + ] + + return list(db.sheets.aggregate(pipeline)) + + +def print_summary(): + """Print summary statistics of the issue.""" + print("\n" + "="*70) + print("SLUGLESS TOPICS ANALYSIS SUMMARY") + print("="*70) + + # Overall counts + affected_count = count_sheets_with_slugless_topics() + total_with_topics = count_total_sheets_with_topics() + total_sheets = db.sheets.count_documents({}) + + print(f"\nšŸ“Š OVERALL STATISTICS:") + print(f" Total sheets in database: {total_sheets:,}") + print(f" Sheets with topics: {total_with_topics:,}") + print(f" Sheets with slugless topics: {affected_count:,}") + + if total_with_topics > 0: + percentage = (affected_count / total_with_topics) * 100 + print(f" Percentage affected: {percentage:.2f}%") + + # User breakdown + print(f"\nšŸ‘„ AFFECTED USERS:") + affected_users = get_affected_users() + print(f" Total users affected: {len(affected_users)}") + print(f"\n Top 10 users by affected sheets:") + for i, user in enumerate(affected_users[:10], 1): + print(f" {i}. User {user['_id']}: {user['sheet_count']} sheet(s)") + + # Topic breakdown + print(f"\nšŸ·ļø SLUGLESS TOPICS BREAKDOWN:") + slugless_topics = get_slugless_topics_summary() + print(f" Total unique slugless topics: {len(slugless_topics)}") + print(f"\n Top 20 most common slugless topics:") + for i, topic in enumerate(slugless_topics[:20], 1): + print(f" {i}. '{topic['asTyped']}': {topic['count']} occurrence(s)") + + print("\n" + "="*70 + "\n") + + +def print_detailed_list(limit=None): + """Print detailed list of affected sheets.""" + print("\n" + "="*70) + print("DETAILED SHEET LIST") + print("="*70 + "\n") + + sheets = find_sheets_with_slugless_topics() + + count = 0 + for sheet in sheets: + count += 1 + if limit and count > limit: + print(f"\n... 
(showing first {limit} sheets)") + break + + print(f"Sheet ID: {sheet['id']}") + print(f" Title: {sheet.get('title', 'Untitled')[:80]}") + print(f" Owner: {sheet['owner']}") + print(f" Status: {sheet.get('status', 'unknown')}") + print(f" Created: {sheet.get('dateCreated', 'unknown')}") + print(f" Modified: {sheet.get('dateModified', 'unknown')}") + print(f" Slugless topics:") + for topic in sheet.get('topics', []): + if 'slug' not in topic: + print(f" - asTyped: '{topic.get('asTyped', 'N/A')}'") + print() + + +def export_to_csv(filename=None): + """Export affected sheets to CSV file.""" + if not filename: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"slugless_topics_{timestamp}.csv" + + sheets = find_sheets_with_slugless_topics() + + with open(filename, 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['sheet_id', 'title', 'owner', 'status', 'date_created', + 'date_modified', 'slugless_topic_asTyped', 'url'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + writer.writeheader() + + count = 0 + for sheet in sheets: + slugless_topics = [t for t in sheet.get('topics', []) if 'slug' not in t] + + for topic in slugless_topics: + writer.writerow({ + 'sheet_id': sheet['id'], + 'title': sheet.get('title', 'Untitled'), + 'owner': sheet['owner'], + 'status': sheet.get('status', 'unknown'), + 'date_created': sheet.get('dateCreated', ''), + 'date_modified': sheet.get('dateModified', ''), + 'slugless_topic_asTyped': topic.get('asTyped', ''), + 'url': f"https://www.sefaria.org/sheets/{sheet['id']}" + }) + count += 1 + + print(f"\nāœ… Exported {count} slugless topics from affected sheets to: {filename}") + + +def fix_all_slugless_topics(dry_run=True): + """ + Fix all sheets with slugless topics by running them through update_sheet_topics. + + Args: + dry_run (bool): If True, only show what would be fixed without actually fixing + """ + affected_sheets = list(find_sheets_with_slugless_topics()) + total = len(affected_sheets) + + print(f"\n{'DRY RUN: ' if dry_run else ''}Found {total} sheet(s) to fix") + + if dry_run: + print("\nThis is a DRY RUN. No changes will be made.") + print("Sheets that would be fixed:") + for sheet in affected_sheets[:10]: + slugless = [t.get('asTyped', 'N/A') for t in sheet.get('topics', []) if 'slug' not in t] + print(f" Sheet {sheet['id']}: {slugless}") + if total > 10: + print(f" ... and {total - 10} more") + return + + print("\nāš ļø WARNING: This will modify sheets in the database!") + response = input("Are you sure you want to continue? 
(yes/no): ") + + if response.lower() != 'yes': + print("Aborted.") + return + + fixed_count = 0 + error_count = 0 + + print(f"\nFixing {total} sheet(s)...") + + for i, sheet in enumerate(affected_sheets, 1): + try: + old_topics = sheet.get('topics', []) + # update_sheet_topics will create slugs for topics without them + result = update_sheet_topics(sheet['id'], old_topics, []) + + if result.get('status') == 'ok': + fixed_count += 1 + if i % 10 == 0: + print(f" Progress: {i}/{total} ({(i/total)*100:.1f}%)") + else: + error_count += 1 + print(f" āŒ Error fixing sheet {sheet['id']}: {result}") + + except Exception as e: + error_count += 1 + print(f" āŒ Exception fixing sheet {sheet['id']}: {e}") + + print(f"\nāœ… Fixed: {fixed_count} sheet(s)") + if error_count > 0: + print(f"āŒ Errors: {error_count} sheet(s)") + + +def main(): + """Main function to run the script.""" + import argparse + + parser = argparse.ArgumentParser( + description='Find and analyze sheets with slugless topics' + ) + parser.add_argument( + '--count-only', + action='store_true', + help='Only show counts, don\'t list sheets' + ) + parser.add_argument( + '--verbose', + action='store_true', + help='Show detailed information for each sheet' + ) + parser.add_argument( + '--limit', + type=int, + default=None, + help='Limit number of sheets to display in verbose mode' + ) + parser.add_argument( + '--export-csv', + action='store_true', + help='Export results to CSV file' + ) + parser.add_argument( + '--csv-filename', + type=str, + default=None, + help='Custom filename for CSV export' + ) + parser.add_argument( + '--fix', + action='store_true', + help='Fix all slugless topics (creates slugs)' + ) + parser.add_argument( + '--dry-run', + action='store_true', + help='Dry run mode for --fix (show what would be fixed)' + ) + + args = parser.parse_args() + + # Always show summary + print_summary() + + # Handle different modes + if args.export_csv: + export_to_csv(args.csv_filename) + + if args.fix: + fix_all_slugless_topics(dry_run=args.dry_run) + + if args.verbose and not args.count_only: + print_detailed_list(limit=args.limit) + + if not args.export_csv and not args.fix and not args.verbose: + print("šŸ’” TIP: Use --verbose to see detailed sheet list") + print("šŸ’” TIP: Use --export-csv to export to CSV file") + print("šŸ’” TIP: Use --fix --dry-run to see what would be fixed") + print("šŸ’” TIP: Use --fix to actually fix the issues") + + +if __name__ == "__main__": + main() +