From c4709bcd2a17268678569fa554a93e45802449a7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Jun 2025 08:07:53 +0000 Subject: [PATCH 1/5] Initial plan for issue From 990d96fb5ec88989c04e019b603c62c6b954dd80 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Jun 2025 08:15:15 +0000 Subject: [PATCH 2/5] Add label truncation functionality and endpoint Co-authored-by: iamarcel <1212234+iamarcel@users.noreply.github.com> --- src/lib/jobs/cleanup-graph.ts | 51 +++++++++++++++++++++++++++++++++++ src/routes/truncate-labels.ts | 27 +++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 src/routes/truncate-labels.ts diff --git a/src/lib/jobs/cleanup-graph.ts b/src/lib/jobs/cleanup-graph.ts index 3ec76cd..83ae460 100644 --- a/src/lib/jobs/cleanup-graph.ts +++ b/src/lib/jobs/cleanup-graph.ts @@ -775,3 +775,54 @@ export function logProposalOverview( }); } } + +/** + * Truncates all node labels longer than 255 characters for a specific user. + * This is a simple cleanup operation to prevent excessively long labels from causing issues. + */ +export async function truncateLongLabels( + userId: string, +): Promise<{ updatedCount: number }> { + const db = await useDatabase(); + + // Find all nodeMetadata records with labels longer than 255 characters for this user + const longLabelNodes = await db + .select({ + id: nodeMetadata.id, + nodeId: nodeMetadata.nodeId, + label: nodeMetadata.label, + }) + .from(nodeMetadata) + .innerJoin(nodes, eq(nodes.id, nodeMetadata.nodeId)) + .where( + and( + eq(nodes.userId, userId), + sql`${nodeMetadata.label} IS NOT NULL`, + sql`length(${nodeMetadata.label}) > 255`, + ), + ); + + if (longLabelNodes.length === 0) { + return { updatedCount: 0 }; + } + + console.log(`Found ${longLabelNodes.length} nodes with labels longer than 255 characters`); + + // Update each node's label to be truncated to 255 characters + let updatedCount = 0; + for (const node of longLabelNodes) { + if (node.label) { + const truncatedLabel = node.label.substring(0, 255); + await db + .update(nodeMetadata) + .set({ label: truncatedLabel }) + .where(eq(nodeMetadata.id, node.id)); + updatedCount++; + + console.log(`Truncated label for node ${node.nodeId}: "${node.label.substring(0, 50)}..." -> "${truncatedLabel.substring(0, 50)}..."`); + } + } + + console.log(`Successfully truncated ${updatedCount} node labels`); + return { updatedCount }; +} diff --git a/src/routes/truncate-labels.ts b/src/routes/truncate-labels.ts new file mode 100644 index 0000000..b405b04 --- /dev/null +++ b/src/routes/truncate-labels.ts @@ -0,0 +1,27 @@ +import { defineEventHandler, readBody } from "h3"; +import { z } from "zod"; +import { truncateLongLabels } from "~/lib/jobs/cleanup-graph"; + +const truncateLabelsRequestSchema = z.object({ + userId: z.string(), +}); + +const truncateLabelsResponseSchema = z.object({ + message: z.string(), + updatedCount: z.number(), +}); + +export default defineEventHandler(async (event) => { + const params = truncateLabelsRequestSchema.parse(await readBody(event)); + + const result = await truncateLongLabels(params.userId); + + console.log( + `Truncated ${result.updatedCount} labels for user ${params.userId}`, + ); + + return truncateLabelsResponseSchema.parse({ + message: `Successfully truncated ${result.updatedCount} labels for user ${params.userId}`, + updatedCount: result.updatedCount, + }); +}); \ No newline at end of file From 6709f2fcfc5de6d9018361f8f37b1493b4414813 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Jun 2025 10:48:14 +0000 Subject: [PATCH 3/5] =?UTF-8?q?=E2=9C=A8=20feat:=20Add=20missing=20embeddi?= =?UTF-8?q?ngs=20generation=20during=20cleanup=20phase?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: iamarcel <1212234+iamarcel@users.noreply.github.com> --- src/lib/jobs/cleanup-graph.ts | 51 ++++++++++++++++++++++++++++++++++- src/routes/truncate-labels.ts | 16 +++++++---- 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/src/lib/jobs/cleanup-graph.ts b/src/lib/jobs/cleanup-graph.ts index 83ae460..d953f19 100644 --- a/src/lib/jobs/cleanup-graph.ts +++ b/src/lib/jobs/cleanup-graph.ts @@ -10,7 +10,7 @@ import { sql, eq, gte, desc, and, inArray } from "drizzle-orm"; import { zodResponseFormat } from "openai/helpers/zod.mjs"; import { z } from "zod"; import { DrizzleDB } from "~/db"; -import { nodes, edges, nodeMetadata, sourceLinks } from "~/db/schema"; +import { nodes, edges, nodeMetadata, sourceLinks, nodeEmbeddings } from "~/db/schema"; import { EdgeTypeEnum, NodeTypeEnum } from "~/types/graph"; import type { EdgeType, NodeType } from "~/types/graph"; import { TypeId, typeIdSchema } from "~/types/typeid"; @@ -826,3 +826,52 @@ export async function truncateLongLabels( console.log(`Successfully truncated ${updatedCount} node labels`); return { updatedCount }; } + +/** + * Generates embeddings for nodes that have labels but are missing embeddings. + * This is a cleanup operation to ensure all nodes with content have searchable embeddings. + */ +export async function generateMissingNodeEmbeddings( + userId: string, +): Promise<{ generatedCount: number }> { + const db = await useDatabase(); + + // Find nodes that have labels but no embeddings for this user + const nodesWithoutEmbeddings = await db + .select({ + id: nodes.id, + label: nodeMetadata.label, + description: nodeMetadata.description, + }) + .from(nodes) + .innerJoin(nodeMetadata, eq(nodeMetadata.nodeId, nodes.id)) + .leftJoin(nodeEmbeddings, eq(nodeEmbeddings.nodeId, nodes.id)) + .where( + and( + eq(nodes.userId, userId), + sql`${nodeMetadata.label} IS NOT NULL`, + sql`trim(${nodeMetadata.label}) != ''`, + sql`${nodeEmbeddings.nodeId} IS NULL`, + ), + ); + + if (nodesWithoutEmbeddings.length === 0) { + console.log("No nodes found with labels but missing embeddings"); + return { generatedCount: 0 }; + } + + console.log(`Found ${nodesWithoutEmbeddings.length} nodes with labels but missing embeddings`); + + // Use the existing central embedding generation function + await generateAndInsertNodeEmbeddings( + db, + nodesWithoutEmbeddings.map((node) => ({ + id: node.id, + label: node.label!, + description: node.description, + })), + ); + + console.log(`Successfully generated embeddings for ${nodesWithoutEmbeddings.length} nodes`); + return { generatedCount: nodesWithoutEmbeddings.length }; +} diff --git a/src/routes/truncate-labels.ts b/src/routes/truncate-labels.ts index b405b04..356b230 100644 --- a/src/routes/truncate-labels.ts +++ b/src/routes/truncate-labels.ts @@ -1,6 +1,6 @@ import { defineEventHandler, readBody } from "h3"; import { z } from "zod"; -import { truncateLongLabels } from "~/lib/jobs/cleanup-graph"; +import { truncateLongLabels, generateMissingNodeEmbeddings } from "~/lib/jobs/cleanup-graph"; const truncateLabelsRequestSchema = z.object({ userId: z.string(), @@ -9,19 +9,25 @@ const truncateLabelsRequestSchema = z.object({ const truncateLabelsResponseSchema = z.object({ message: z.string(), updatedCount: z.number(), + embeddingsGeneratedCount: z.number(), }); export default defineEventHandler(async (event) => { const params = truncateLabelsRequestSchema.parse(await readBody(event)); - const result = await truncateLongLabels(params.userId); + // Run both cleanup operations + const [truncateResult, embeddingsResult] = await Promise.all([ + truncateLongLabels(params.userId), + generateMissingNodeEmbeddings(params.userId), + ]); console.log( - `Truncated ${result.updatedCount} labels for user ${params.userId}`, + `Cleanup completed for user ${params.userId}: truncated ${truncateResult.updatedCount} labels, generated ${embeddingsResult.generatedCount} embeddings`, ); return truncateLabelsResponseSchema.parse({ - message: `Successfully truncated ${result.updatedCount} labels for user ${params.userId}`, - updatedCount: result.updatedCount, + message: `Successfully truncated ${truncateResult.updatedCount} labels and generated ${embeddingsResult.generatedCount} embeddings for user ${params.userId}`, + updatedCount: truncateResult.updatedCount, + embeddingsGeneratedCount: embeddingsResult.generatedCount, }); }); \ No newline at end of file From 2f2bb606c803c69d463b0d97a43aeb5a2661943a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Jun 2025 07:44:57 +0000 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=94=A7=20chore:=20Integrate=20label?= =?UTF-8?q?=20truncation=20into=20cleanup=20job=20instead=20of=20separate?= =?UTF-8?q?=20endpoint?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: iamarcel <1212234+iamarcel@users.noreply.github.com> --- src/lib/queues.ts | 16 ++++++++++++++++ src/routes/truncate-labels.ts | 33 --------------------------------- 2 files changed, 16 insertions(+), 33 deletions(-) delete mode 100644 src/routes/truncate-labels.ts diff --git a/src/lib/queues.ts b/src/lib/queues.ts index 1c0f64f..c5c6c90 100644 --- a/src/lib/queues.ts +++ b/src/lib/queues.ts @@ -179,6 +179,22 @@ const worker = new Worker( `Starting cleanup-graph job for user ${data.userId}, since ${data.since.toISOString()}`, ); + // First, run basic cleanup operations + const { truncateLongLabels, generateMissingNodeEmbeddings } = await import( + "./jobs/cleanup-graph" + ); + + console.log("Running basic cleanup operations..."); + const [truncateResult, embeddingsResult] = await Promise.all([ + truncateLongLabels(data.userId), + generateMissingNodeEmbeddings(data.userId), + ]); + + console.log( + `Basic cleanup completed: truncated ${truncateResult.updatedCount} labels, generated ${embeddingsResult.generatedCount} embeddings`, + ); + + // Then run the iterative graph cleanup const { runIterativeCleanup } = await import( "./jobs/run-iterative-cleanup" ); diff --git a/src/routes/truncate-labels.ts b/src/routes/truncate-labels.ts deleted file mode 100644 index 356b230..0000000 --- a/src/routes/truncate-labels.ts +++ /dev/null @@ -1,33 +0,0 @@ -import { defineEventHandler, readBody } from "h3"; -import { z } from "zod"; -import { truncateLongLabels, generateMissingNodeEmbeddings } from "~/lib/jobs/cleanup-graph"; - -const truncateLabelsRequestSchema = z.object({ - userId: z.string(), -}); - -const truncateLabelsResponseSchema = z.object({ - message: z.string(), - updatedCount: z.number(), - embeddingsGeneratedCount: z.number(), -}); - -export default defineEventHandler(async (event) => { - const params = truncateLabelsRequestSchema.parse(await readBody(event)); - - // Run both cleanup operations - const [truncateResult, embeddingsResult] = await Promise.all([ - truncateLongLabels(params.userId), - generateMissingNodeEmbeddings(params.userId), - ]); - - console.log( - `Cleanup completed for user ${params.userId}: truncated ${truncateResult.updatedCount} labels, generated ${embeddingsResult.generatedCount} embeddings`, - ); - - return truncateLabelsResponseSchema.parse({ - message: `Successfully truncated ${truncateResult.updatedCount} labels and generated ${embeddingsResult.generatedCount} embeddings for user ${params.userId}`, - updatedCount: truncateResult.updatedCount, - embeddingsGeneratedCount: embeddingsResult.generatedCount, - }); -}); \ No newline at end of file From 3c0c801b668442f202e44407ca9177ba5eb4dc46 Mon Sep 17 00:00:00 2001 From: Marcel Samyn Date: Sun, 8 Jun 2025 13:00:04 +0200 Subject: [PATCH 5/5] format --- src/lib/jobs/cleanup-graph.ts | 30 ++++++++++++++++++++++-------- src/lib/queues.ts | 9 ++++----- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/lib/jobs/cleanup-graph.ts b/src/lib/jobs/cleanup-graph.ts index d953f19..de6d304 100644 --- a/src/lib/jobs/cleanup-graph.ts +++ b/src/lib/jobs/cleanup-graph.ts @@ -10,7 +10,13 @@ import { sql, eq, gte, desc, and, inArray } from "drizzle-orm"; import { zodResponseFormat } from "openai/helpers/zod.mjs"; import { z } from "zod"; import { DrizzleDB } from "~/db"; -import { nodes, edges, nodeMetadata, sourceLinks, nodeEmbeddings } from "~/db/schema"; +import { + nodes, + edges, + nodeMetadata, + sourceLinks, + nodeEmbeddings, +} from "~/db/schema"; import { EdgeTypeEnum, NodeTypeEnum } from "~/types/graph"; import type { EdgeType, NodeType } from "~/types/graph"; import { TypeId, typeIdSchema } from "~/types/typeid"; @@ -784,7 +790,7 @@ export async function truncateLongLabels( userId: string, ): Promise<{ updatedCount: number }> { const db = await useDatabase(); - + // Find all nodeMetadata records with labels longer than 255 characters for this user const longLabelNodes = await db .select({ @@ -806,7 +812,9 @@ export async function truncateLongLabels( return { updatedCount: 0 }; } - console.log(`Found ${longLabelNodes.length} nodes with labels longer than 255 characters`); + console.log( + `Found ${longLabelNodes.length} nodes with labels longer than 255 characters`, + ); // Update each node's label to be truncated to 255 characters let updatedCount = 0; @@ -818,8 +826,10 @@ export async function truncateLongLabels( .set({ label: truncatedLabel }) .where(eq(nodeMetadata.id, node.id)); updatedCount++; - - console.log(`Truncated label for node ${node.nodeId}: "${node.label.substring(0, 50)}..." -> "${truncatedLabel.substring(0, 50)}..."`); + + console.log( + `Truncated label for node ${node.nodeId}: "${node.label.substring(0, 50)}..." -> "${truncatedLabel.substring(0, 50)}..."`, + ); } } @@ -835,7 +845,7 @@ export async function generateMissingNodeEmbeddings( userId: string, ): Promise<{ generatedCount: number }> { const db = await useDatabase(); - + // Find nodes that have labels but no embeddings for this user const nodesWithoutEmbeddings = await db .select({ @@ -860,7 +870,9 @@ export async function generateMissingNodeEmbeddings( return { generatedCount: 0 }; } - console.log(`Found ${nodesWithoutEmbeddings.length} nodes with labels but missing embeddings`); + console.log( + `Found ${nodesWithoutEmbeddings.length} nodes with labels but missing embeddings`, + ); // Use the existing central embedding generation function await generateAndInsertNodeEmbeddings( @@ -872,6 +884,8 @@ export async function generateMissingNodeEmbeddings( })), ); - console.log(`Successfully generated embeddings for ${nodesWithoutEmbeddings.length} nodes`); + console.log( + `Successfully generated embeddings for ${nodesWithoutEmbeddings.length} nodes`, + ); return { generatedCount: nodesWithoutEmbeddings.length }; } diff --git a/src/lib/queues.ts b/src/lib/queues.ts index c5c6c90..5bb3367 100644 --- a/src/lib/queues.ts +++ b/src/lib/queues.ts @@ -180,16 +180,15 @@ const worker = new Worker( ); // First, run basic cleanup operations - const { truncateLongLabels, generateMissingNodeEmbeddings } = await import( - "./jobs/cleanup-graph" - ); - + const { truncateLongLabels, generateMissingNodeEmbeddings } = + await import("./jobs/cleanup-graph"); + console.log("Running basic cleanup operations..."); const [truncateResult, embeddingsResult] = await Promise.all([ truncateLongLabels(data.userId), generateMissingNodeEmbeddings(data.userId), ]); - + console.log( `Basic cleanup completed: truncated ${truncateResult.updatedCount} labels, generated ${embeddingsResult.generatedCount} embeddings`, );