diff --git a/src/lib/jobs/cleanup-graph.ts b/src/lib/jobs/cleanup-graph.ts
index 3ec76cd..de6d304 100644
--- a/src/lib/jobs/cleanup-graph.ts
+++ b/src/lib/jobs/cleanup-graph.ts
@@ -10,7 +10,13 @@ import { sql, eq, gte, desc, and, inArray } from "drizzle-orm";
 import { zodResponseFormat } from "openai/helpers/zod.mjs";
 import { z } from "zod";
 import { DrizzleDB } from "~/db";
-import { nodes, edges, nodeMetadata, sourceLinks } from "~/db/schema";
+import {
+  nodes,
+  edges,
+  nodeMetadata,
+  sourceLinks,
+  nodeEmbeddings,
+} from "~/db/schema";
 import { EdgeTypeEnum, NodeTypeEnum } from "~/types/graph";
 import type { EdgeType, NodeType } from "~/types/graph";
 import { TypeId, typeIdSchema } from "~/types/typeid";
@@ -775,3 +781,124 @@ export function logProposalOverview(
     });
   }
 }
+
+/**
+ * Truncates all node labels longer than 255 characters for a specific user.
+ * This is a simple cleanup operation to prevent excessively long labels from causing issues.
+ *
+ * Matching rows are selected first and then rewritten one UPDATE at a time,
+ * logging each change.
+ *
+ * @param userId - Owner of the nodes whose labels are checked.
+ * @returns The number of labels that were rewritten.
+ */
+export async function truncateLongLabels(
+  userId: string,
+): Promise<{ updatedCount: number }> {
+  const db = await useDatabase();
+
+  // Find all nodeMetadata records with labels longer than 255 characters for this user
+  const longLabelNodes = await db
+    .select({
+      id: nodeMetadata.id,
+      nodeId: nodeMetadata.nodeId,
+      label: nodeMetadata.label,
+    })
+    .from(nodeMetadata)
+    .innerJoin(nodes, eq(nodes.id, nodeMetadata.nodeId))
+    .where(
+      and(
+        eq(nodes.userId, userId),
+        sql`${nodeMetadata.label} IS NOT NULL`,
+        sql`length(${nodeMetadata.label}) > 255`,
+      ),
+    );
+
+  if (longLabelNodes.length === 0) {
+    return { updatedCount: 0 };
+  }
+
+  console.log(
+    `Found ${longLabelNodes.length} nodes with labels longer than 255 characters`,
+  );
+
+  // Update each node's label to be truncated to 255 characters
+  let updatedCount = 0;
+  for (const node of longLabelNodes) {
+    if (node.label) {
+      // Cut on code points rather than UTF-16 code units: substring(0, 255)
+      // can split a surrogate pair and leave a lone surrogate at the end.
+      // NOTE(review): assumes SQL length() counts characters - confirm for this database.
+      const truncatedLabel = Array.from(node.label).slice(0, 255).join("");
+      await db
+        .update(nodeMetadata)
+        .set({ label: truncatedLabel })
+        .where(eq(nodeMetadata.id, node.id));
+      updatedCount++;
+
+      console.log(
+        `Truncated label for node ${node.nodeId}: "${node.label.substring(0, 50)}..." -> "${truncatedLabel.substring(0, 50)}..."`,
+      );
+    }
+  }
+
+  console.log(`Successfully truncated ${updatedCount} node labels`);
+  return { updatedCount };
+}
+
+/**
+ * Generates embeddings for nodes that have labels but are missing embeddings.
+ * This is a cleanup operation to ensure all nodes with content have searchable embeddings.
+ *
+ * @param userId - Owner of the nodes to scan.
+ * @returns The number of nodes passed to the embedding generator.
+ */
+export async function generateMissingNodeEmbeddings(
+  userId: string,
+): Promise<{ generatedCount: number }> {
+  const db = await useDatabase();
+
+  // Find nodes that have labels but no embeddings for this user
+  const nodesWithoutEmbeddings = await db
+    .select({
+      id: nodes.id,
+      label: nodeMetadata.label,
+      description: nodeMetadata.description,
+    })
+    .from(nodes)
+    .innerJoin(nodeMetadata, eq(nodeMetadata.nodeId, nodes.id))
+    .leftJoin(nodeEmbeddings, eq(nodeEmbeddings.nodeId, nodes.id))
+    .where(
+      and(
+        eq(nodes.userId, userId),
+        sql`${nodeMetadata.label} IS NOT NULL`,
+        sql`trim(${nodeMetadata.label}) != ''`,
+        sql`${nodeEmbeddings.nodeId} IS NULL`,
+      ),
+    );
+
+  if (nodesWithoutEmbeddings.length === 0) {
+    console.log("No nodes found with labels but missing embeddings");
+    return { generatedCount: 0 };
+  }
+
+  console.log(
+    `Found ${nodesWithoutEmbeddings.length} nodes with labels but missing embeddings`,
+  );
+
+  // Use the existing central embedding generation function
+  await generateAndInsertNodeEmbeddings(
+    db,
+    nodesWithoutEmbeddings.map((node) => ({
+      id: node.id,
+      // The IS NOT NULL filter in the query above excludes null labels.
+      label: node.label!,
+      description: node.description,
+    })),
+  );
+
+  console.log(
+    `Successfully generated embeddings for ${nodesWithoutEmbeddings.length} nodes`,
+  );
+  return { generatedCount: nodesWithoutEmbeddings.length };
+}
diff --git a/src/lib/queues.ts b/src/lib/queues.ts
index 1c0f64f..5bb3367 100644
--- a/src/lib/queues.ts
+++ b/src/lib/queues.ts
@@ -179,6 +179,24 @@ const worker = new Worker(
           `Starting cleanup-graph job for user ${data.userId}, since ${data.since.toISOString()}`,
         );
 
+        // First, run basic cleanup operations
+        const { truncateLongLabels, generateMissingNodeEmbeddings } =
+          await import("./jobs/cleanup-graph");
+
+        console.log("Running basic cleanup operations...");
+        // Run sequentially, truncation first: the embedding step reads the same
+        // labels the truncation step rewrites, so running both concurrently
+        // could embed a label that is about to be changed.
+        const truncateResult = await truncateLongLabels(data.userId);
+        const embeddingsResult = await generateMissingNodeEmbeddings(
+          data.userId,
+        );
+
+        console.log(
+          `Basic cleanup completed: truncated ${truncateResult.updatedCount} labels, generated ${embeddingsResult.generatedCount} embeddings`,
+        );
+
+        // Then run the iterative graph cleanup
         const { runIterativeCleanup } = await import(
           "./jobs/run-iterative-cleanup"
         );