diff --git a/ldai/client.go b/ldai/client.go index d6feb388..4536bda0 100644 --- a/ldai/client.go +++ b/ldai/client.go @@ -102,7 +102,11 @@ func (c *Client) Config( builder := NewConfig(). WithModelName(parsed.Model.Name). WithProviderName(parsed.Provider.Name). - WithEnabled(parsed.Meta.Enabled) + WithEnabled(parsed.Meta.Enabled). + WithMode(parsed.Mode). + WithEvaluationMetricKey(parsed.EvaluationMetricKey). + WithEvaluationMetricKeys(parsed.EvaluationMetricKeys). + WithJudgeConfiguration(parsed.JudgeConfiguration) for k, v := range parsed.Model.Parameters { builder.WithModelParam(k, v) @@ -174,3 +178,38 @@ func interpolateTemplate(template string, variables map[string]interface{}) (str } return m.RenderString(variables) } + +// JudgeConfig evaluates an AI Config, tracking it as a judge function. See Config for details. +// +// This method extends the provided variables with reserved judge variables: +// - "message_history": "{{message_history}}" +// - "response_to_evaluate": "{{response_to_evaluate}}" +// +// These literal placeholder strings preserve the Mustache templates through the first interpolation +// (during config fetch), allowing Judge.Evaluate() to perform a second interpolation with actual values. +func (c *Client) JudgeConfig( + key string, + context ldcontext.Context, + defaultValue Config, + variables map[string]interface{}, +) (Config, *Tracker) { + _ = c.sdk.TrackMetric("$ld:ai:judge:function:single", context, 1, ldvalue.String(key)) + + // Extend variables with reserved judge placeholders + extendedVariables := make(map[string]interface{}) + for k, v := range variables { + // Warn if user tries to override reserved variables + if k == "message_history" || k == "response_to_evaluate" { + c.logger.Warnf("AI Config '%s': variable '%s' is reserved by judge and will be ignored", key, k) + continue + } + extendedVariables[k] = v + } + + // Inject reserved variables as literal placeholder strings + // These will be preserved through the first interpolation and resolved during Judge.Evaluate() + extendedVariables["message_history"] = "{{message_history}}" + extendedVariables["response_to_evaluate"] = "{{response_to_evaluate}}" + + return c.Config(key, context, defaultValue, extendedVariables) +} diff --git a/ldai/client_test.go b/ldai/client_test.go index 525ea42b..6ef58927 100644 --- a/ldai/client_test.go +++ b/ldai/client_test.go @@ -579,3 +579,167 @@ func TestInterpolation(t *testing.T) { assert.Equal(t, "user_kind=<>,cat_kind=<>", result) }) } + +func TestParseJudgeSpecificFields(t *testing.T) { + json := []byte(`{ + "_ldMeta": {"variationKey": "1", "enabled": true}, + "mode": "judge", + "evaluationMetricKey": "toxicity", + "judgeConfiguration": { + "judges": [ + {"key": "judge1", "samplingRate": 0.5}, + {"key": "judge2", "samplingRate": 1.0} + ] + }, + "messages": [ + {"content": "test", "role": "system"} + ] + }`) + + client, err := NewClient(newMockSDK(json, nil)) + require.NoError(t, err) + require.NotNil(t, client) + + cfg, _ := client.Config("key", ldcontext.New("user"), Disabled(), nil) + + assert.Equal(t, "judge", cfg.Mode()) + assert.Equal(t, "toxicity", cfg.EvaluationMetricKey()) + + judgeConfig := cfg.JudgeConfiguration() + require.NotNil(t, judgeConfig) + require.Len(t, judgeConfig.Judges, 2) + assert.Equal(t, "judge1", judgeConfig.Judges[0].Key) + assert.Equal(t, 0.5, judgeConfig.Judges[0].SamplingRate) + assert.Equal(t, "judge2", judgeConfig.Judges[1].Key) + assert.Equal(t, 1.0, judgeConfig.Judges[1].SamplingRate) +} + +func TestParseEvaluationMetricKeys(t 
*testing.T) { + json := []byte(`{ + "_ldMeta": {"variationKey": "1", "enabled": true}, + "mode": "judge", + "evaluationMetricKeys": ["relevance", "accuracy"], + "messages": [ + {"content": "test", "role": "system"} + ] + }`) + + client, err := NewClient(newMockSDK(json, nil)) + require.NoError(t, err) + require.NotNil(t, client) + + cfg, _ := client.Config("key", ldcontext.New("user"), Disabled(), nil) + + assert.Equal(t, "judge", cfg.Mode()) + assert.Equal(t, "", cfg.EvaluationMetricKey()) + assert.Equal(t, []string{"relevance", "accuracy"}, cfg.EvaluationMetricKeys()) +} + +func TestParseEvaluationMetricKeyPriority(t *testing.T) { + json := []byte(`{ + "_ldMeta": {"variationKey": "1", "enabled": true}, + "mode": "judge", + "evaluationMetricKey": "toxicity", + "evaluationMetricKeys": ["relevance", "accuracy"], + "messages": [ + {"content": "test", "role": "system"} + ] + }`) + + client, err := NewClient(newMockSDK(json, nil)) + require.NoError(t, err) + require.NotNil(t, client) + + cfg, _ := client.Config("key", ldcontext.New("user"), Disabled(), nil) + + assert.Equal(t, "judge", cfg.Mode()) + // Both fields should be parsed + assert.Equal(t, "toxicity", cfg.EvaluationMetricKey()) + assert.Equal(t, []string{"relevance", "accuracy"}, cfg.EvaluationMetricKeys()) +} + +func TestJudgeConfigurationImmutable(t *testing.T) { + // Test that mutations to JudgeConfiguration don't affect the Config + judgeConfig := &datamodel.JudgeConfiguration{ + Judges: []datamodel.Judge{ + {Key: "judge1", SamplingRate: 0.5}, + {Key: "judge2", SamplingRate: 1.0}, + }, + } + + builder := NewConfig(). + Enable(). + WithJudgeConfiguration(judgeConfig) + cfg := builder.Build() + + // Mutate the original + judgeConfig.Judges[0].Key = "mutated" + judgeConfig.Judges = append(judgeConfig.Judges, datamodel.Judge{Key: "judge3", SamplingRate: 0.3}) + + // Config should not be affected + retrieved := cfg.JudgeConfiguration() + require.NotNil(t, retrieved) + require.Len(t, retrieved.Judges, 2) + assert.Equal(t, "judge1", retrieved.Judges[0].Key) // Should still be original value + assert.Equal(t, "judge2", retrieved.Judges[1].Key) + + // Mutate the retrieved config + retrieved.Judges[0].Key = "mutated_again" + retrieved.Judges = append(retrieved.Judges, datamodel.Judge{Key: "judge4", SamplingRate: 0.4}) + + // Config should still not be affected + retrieved2 := cfg.JudgeConfiguration() + require.NotNil(t, retrieved2) + require.Len(t, retrieved2.Judges, 2) + assert.Equal(t, "judge1", retrieved2.Judges[0].Key) // Should still be original value + assert.Equal(t, "judge2", retrieved2.Judges[1].Key) +} + +// TestJudgeConfig_PreservesReservedPlaceholders verifies that JudgeConfig injects reserved variables +// so that {{message_history}} and {{response_to_evaluate}} are preserved for the second interpolation +// pass during Judge.Evaluate(). Without this, Config's first Mustache pass would render them as empty. 
+func TestJudgeConfig_PreservesReservedPlaceholders(t *testing.T) { + json := []byte(`{ + "_ldMeta": {"variationKey": "1", "enabled": true}, + "mode": "judge", + "evaluationMetricKey": "toxicity", + "messages": [ + {"content": "You are a judge.", "role": "system"}, + {"content": "Input: {{message_history}}\nOutput: {{response_to_evaluate}}", "role": "user"} + ] + }`) + + client, err := NewClient(newMockSDK(json, nil)) + require.NoError(t, err) + require.NotNil(t, client) + + cfg, _ := client.JudgeConfig("judge-key", ldcontext.New("user"), Disabled(), nil) + + msgs := cfg.Messages() + require.Len(t, msgs, 2) + assert.Equal(t, "You are a judge.", msgs[0].Content) + assert.Contains(t, msgs[1].Content, "{{message_history}}", "JudgeConfig must preserve placeholder for second interpolation") + assert.Contains(t, msgs[1].Content, "{{response_to_evaluate}}", "JudgeConfig must preserve placeholder for second interpolation") + assert.Equal(t, "Input: {{message_history}}\nOutput: {{response_to_evaluate}}", msgs[1].Content) +} + +// TestConfig_WithoutReservedVarsWipesJudgePlaceholders documents that Config (without reserved vars) +// renders {{message_history}} and {{response_to_evaluate}} as empty when used for judge templates. +func TestConfig_WithoutReservedVarsWipesJudgePlaceholders(t *testing.T) { + json := []byte(`{ + "_ldMeta": {"variationKey": "1", "enabled": true}, + "messages": [ + {"content": "Input: {{message_history}}\nOutput: {{response_to_evaluate}}", "role": "user"} + ] + }`) + + client, err := NewClient(newMockSDK(json, nil)) + require.NoError(t, err) + require.NotNil(t, client) + + cfg, _ := client.Config("key", ldcontext.New("user"), Disabled(), nil) + + msgs := cfg.Messages() + require.Len(t, msgs, 1) + assert.Equal(t, "Input: \nOutput: ", msgs[0].Content, "Config without reserved vars renders placeholders as empty") +} diff --git a/ldai/config.go b/ldai/config.go index fd9e30aa..5fba4e7e 100644 --- a/ldai/config.go +++ b/ldai/config.go @@ -60,6 +60,33 @@ func (c *Config) CustomModelParam(key string) (ldvalue.Value, bool) { return val, ok } +// Mode returns the AI Config mode (e.g., "completion", "agent", "judge"). +func (c *Config) Mode() string { + return c.c.Mode +} + +// EvaluationMetricKey returns the evaluation metric key for judge mode configs. +func (c *Config) EvaluationMetricKey() string { + return c.c.EvaluationMetricKey +} + +// EvaluationMetricKeys returns the deprecated array of evaluation metric keys. +// Use EvaluationMetricKey instead. +func (c *Config) EvaluationMetricKeys() []string { + return slices.Clone(c.c.EvaluationMetricKeys) +} + +// JudgeConfiguration returns the judge configuration attached to this config, if any. +// Returns a defensive copy to prevent mutations. +func (c *Config) JudgeConfiguration() *datamodel.JudgeConfiguration { + if c.c.JudgeConfiguration == nil { + return nil + } + return &datamodel.JudgeConfiguration{ + Judges: slices.Clone(c.c.JudgeConfiguration.Judges), + } +} + // AsLdValue is used internally. func (c *Config) AsLdValue() ldvalue.Value { return ldvalue.FromJSONMarshal(c.c) @@ -68,12 +95,16 @@ func (c *Config) AsLdValue() ldvalue.Value { // ConfigBuilder is used to define a default AI Config, returned when LaunchDarkly is unreachable or there // is an error evaluating the Config. 
type ConfigBuilder struct { - messages []datamodel.Message - enabled bool - providerName string - modelName string - modelParams map[string]ldvalue.Value - modelCustomParams map[string]ldvalue.Value + messages []datamodel.Message + enabled bool + providerName string + modelName string + modelParams map[string]ldvalue.Value + modelCustomParams map[string]ldvalue.Value + mode string + evaluationMetricKey string + evaluationMetricKeys []string + judgeConfiguration *datamodel.JudgeConfiguration } // NewConfig returns a new ConfigBuilder. By default, the Config is disabled. @@ -141,8 +172,47 @@ func (cb *ConfigBuilder) WithCustomModelParam(key string, value ldvalue.Value) * return cb } +// WithMode sets the AI Config mode (e.g., "completion", "agent", "judge"). +func (cb *ConfigBuilder) WithMode(mode string) *ConfigBuilder { + cb.mode = mode + return cb +} + +// WithEvaluationMetricKey sets the evaluation metric key for judge mode configs. +func (cb *ConfigBuilder) WithEvaluationMetricKey(key string) *ConfigBuilder { + cb.evaluationMetricKey = key + return cb +} + +// WithEvaluationMetricKeys sets the deprecated array of evaluation metric keys. +// Use WithEvaluationMetricKey instead. +func (cb *ConfigBuilder) WithEvaluationMetricKeys(keys []string) *ConfigBuilder { + cb.evaluationMetricKeys = slices.Clone(keys) + return cb +} + +// WithJudgeConfiguration sets the judge configuration for this config. +// The provided judgeConfig is defensively copied. +func (cb *ConfigBuilder) WithJudgeConfiguration(judgeConfig *datamodel.JudgeConfiguration) *ConfigBuilder { + if judgeConfig == nil { + cb.judgeConfiguration = nil + return cb + } + cb.judgeConfiguration = &datamodel.JudgeConfiguration{ + Judges: slices.Clone(judgeConfig.Judges), + } + return cb +} + // Build creates a Config from the current builder state. func (cb *ConfigBuilder) Build() Config { + var judgeConfig *datamodel.JudgeConfiguration + if cb.judgeConfiguration != nil { + judgeConfig = &datamodel.JudgeConfiguration{ + Judges: slices.Clone(cb.judgeConfiguration.Judges), + } + } + return Config{ c: datamodel.Config{ Messages: slices.Clone(cb.messages), @@ -157,6 +227,10 @@ func (cb *ConfigBuilder) Build() Config { Provider: datamodel.Provider{ Name: cb.providerName, }, + Mode: cb.mode, + EvaluationMetricKey: cb.evaluationMetricKey, + EvaluationMetricKeys: slices.Clone(cb.evaluationMetricKeys), + JudgeConfiguration: judgeConfig, }, } } diff --git a/ldai/datamodel/datamodel.go b/ldai/datamodel/datamodel.go index 04c56d1d..b5de1961 100644 --- a/ldai/datamodel/datamodel.go +++ b/ldai/datamodel/datamodel.go @@ -68,4 +68,56 @@ type Config struct { // Provider is the provider. Provider Provider `json:"provider,omitempty"` + + // Mode is the AI Config mode (e.g., "completion", "agent", "judge"). + Mode string `json:"mode,omitempty"` + + // EvaluationMetricKey is the evaluation metric key for judge mode configs. + EvaluationMetricKey string `json:"evaluationMetricKey,omitempty"` + + // EvaluationMetricKeys is a deprecated array of evaluation metric keys. + // Use EvaluationMetricKey instead. + EvaluationMetricKeys []string `json:"evaluationMetricKeys,omitempty"` + + // JudgeConfiguration specifies judges attached to this config. + JudgeConfiguration *JudgeConfiguration `json:"judgeConfiguration,omitempty"` +} + +// JudgeConfiguration defines the configuration for judges attached to a config. +type JudgeConfiguration struct { + // Judges is a list of judges to evaluate this config's outputs. 
+ Judges []Judge `json:"judges,omitempty"` +} + +// Judge defines a single judge reference with key and sampling rate. +type Judge struct { + // Key is the judge config key. + Key string `json:"key"` + + // SamplingRate is the probability (0.0-1.0) that the judge will evaluate. + SamplingRate float64 `json:"samplingRate"` +} + +// EvalScore represents a single evaluation metric result. +type EvalScore struct { + // Score is the evaluation score between 0.0 and 1.0. + Score float64 `json:"score"` + + // Reasoning is the explanation for the score. + Reasoning string `json:"reasoning"` +} + +// JudgeResponse represents the response from a judge evaluation. +type JudgeResponse struct { + // Evals contains the evaluation results keyed by metric name. + Evals map[string]EvalScore `json:"evals"` + + // Success indicates whether the evaluation completed successfully. + Success bool `json:"success"` + + // JudgeConfigKey is the key of the judge config that produced this response. + JudgeConfigKey string `json:"judgeConfigKey,omitempty"` + + // Error contains the error message if the evaluation failed. + Error string `json:"error,omitempty"` } diff --git a/ldai/judge/judge.go b/ldai/judge/judge.go new file mode 100644 index 00000000..392cb13b --- /dev/null +++ b/ldai/judge/judge.go @@ -0,0 +1,220 @@ +package judge + +import ( + "fmt" + "math/rand" + "strings" + + "github.com/alexkappa/mustache" + "github.com/launchdarkly/go-sdk-common/v3/ldvalue" + "github.com/launchdarkly/go-server-sdk/ldai" + "github.com/launchdarkly/go-server-sdk/ldai/datamodel" + "github.com/launchdarkly/go-server-sdk/v7/interfaces" +) + +type Config interface { + Messages() []datamodel.Message + ModelParam(key string) (ldvalue.Value, bool) + CustomModelParam(key string) (ldvalue.Value, bool) + EvaluationMetricKey() string + EvaluationMetricKeys() []string +} + +type Tracker interface { + TrackJudgeResponse(response datamodel.JudgeResponse) error + TrackUsage(usage ldai.TokenUsage) error +} + +type StructuredResponse struct { + Content map[string]interface{} + Usage ldai.TokenUsage +} + +type Provider interface { + InvokeStructuredModel(messages []datamodel.Message, schema map[string]interface{}) (StructuredResponse, error) +} + +type Judge struct { + config Config + tracker Tracker + provider Provider + metricKey string + judgeConfigKey string + logger interfaces.LDLoggers +} + +func New(config Config, tracker Tracker, provider Provider, configKey string, logger interfaces.LDLoggers) (*Judge, error) { + if config == nil { + return nil, fmt.Errorf("config must not be nil") + } + if tracker == nil { + return nil, fmt.Errorf("tracker must not be nil") + } + if provider == nil { + return nil, fmt.Errorf("provider must not be nil") + } + + metricKey, err := getMetricKey(config, logger, configKey) + if err != nil { + return nil, err + } + + return &Judge{ + config: config, + tracker: tracker, + provider: provider, + metricKey: metricKey, + judgeConfigKey: configKey, + logger: logger, + }, nil +} + +func (j *Judge) Evaluate(input, output string, samplingRate float64) (*datamodel.JudgeResponse, error) { + if len(j.config.Messages()) == 0 { + return nil, nil + } + + if samplingRate < 1.0 && rand.Float64() > samplingRate { + return nil, nil + } + + messages := j.buildMessages(input, output) + schema := buildSchema(j.metricKey) + + response, err := j.provider.InvokeStructuredModel(messages, schema) + if err != nil { + return &datamodel.JudgeResponse{ + Evals: map[string]datamodel.EvalScore{}, + Success: false, + Error: err.Error(), + 
JudgeConfigKey: j.judgeConfigKey, + }, nil + } + + if response.Usage.Total > 0 || response.Usage.Input > 0 || response.Usage.Output > 0 { + _ = j.tracker.TrackUsage(response.Usage) + } + + result := j.parseResponse(response.Content) + // Note: Judge response tracking should be done by the caller (AI config being evaluated) + // not by the judge itself. This matches Python and JavaScript SDK behavior. + + return result, nil +} + +func (j *Judge) EvaluateMessages(messages []datamodel.Message, response string, samplingRate float64) (*datamodel.JudgeResponse, error) { + parts := make([]string, len(messages)) + for i, msg := range messages { + parts[i] = msg.Content + } + input := strings.Join(parts, "\r\n") + return j.Evaluate(input, response, samplingRate) +} + +func (j *Judge) GetConfig() Config { + return j.config +} + +func (j *Judge) GetTracker() Tracker { + return j.tracker +} + +func (j *Judge) GetProvider() Provider { + return j.provider +} + +func (j *Judge) buildMessages(input, output string) []datamodel.Message { + vars := map[string]interface{}{ + "message_history": input, + "response_to_evaluate": output, + } + + messages := j.config.Messages() + result := make([]datamodel.Message, len(messages)) + + for i, msg := range messages { + m := mustache.New() + if err := m.ParseString(msg.Content); err != nil { + result[i] = datamodel.Message{Content: msg.Content, Role: msg.Role} + continue + } + content, err := m.RenderString(vars) + if err != nil { + result[i] = datamodel.Message{Content: msg.Content, Role: msg.Role} + continue + } + result[i] = datamodel.Message{Content: content, Role: msg.Role} + } + + return result +} + +func (j *Judge) parseResponse(data map[string]interface{}) *datamodel.JudgeResponse { + evaluations, ok := data["evaluations"].(map[string]interface{}) + if !ok { + return &datamodel.JudgeResponse{ + Evals: map[string]datamodel.EvalScore{}, + Success: false, + Error: "missing evaluations object", + JudgeConfigKey: j.judgeConfigKey, + } + } + + evalData, ok := evaluations[j.metricKey].(map[string]interface{}) + if !ok { + return &datamodel.JudgeResponse{ + Evals: map[string]datamodel.EvalScore{}, + Success: false, + Error: fmt.Sprintf("missing evaluation for %s", j.metricKey), + JudgeConfigKey: j.judgeConfigKey, + } + } + + score, ok := evalData["score"].(float64) + if !ok || score < 0 || score > 1 { + return &datamodel.JudgeResponse{ + Evals: map[string]datamodel.EvalScore{}, + Success: false, + Error: "invalid score", + JudgeConfigKey: j.judgeConfigKey, + } + } + + reasoning, ok := evalData["reasoning"].(string) + if !ok { + return &datamodel.JudgeResponse{ + Evals: map[string]datamodel.EvalScore{}, + Success: false, + Error: "invalid reasoning", + JudgeConfigKey: j.judgeConfigKey, + } + } + + return &datamodel.JudgeResponse{ + Evals: map[string]datamodel.EvalScore{ + j.metricKey: { + Score: score, + Reasoning: reasoning, + }, + }, + Success: true, + JudgeConfigKey: j.judgeConfigKey, + } +} + +func getMetricKey(config Config, logger interfaces.LDLoggers, configKey string) (string, error) { + // Priority 1: Check top-level evaluationMetricKey field (primary field) + if metricKey := config.EvaluationMetricKey(); strings.TrimSpace(metricKey) != "" { + return strings.TrimSpace(metricKey), nil + } + + // Priority 2: Check top-level evaluationMetricKeys array (deprecated) + keys := config.EvaluationMetricKeys() + for _, key := range keys { + if trimmed := strings.TrimSpace(key); trimmed != "" { + return trimmed, nil + } + } + + return "", fmt.Errorf("missing 
evaluationMetricKey") +} diff --git a/ldai/judge/judge_test.go b/ldai/judge/judge_test.go new file mode 100644 index 00000000..a9fbbe90 --- /dev/null +++ b/ldai/judge/judge_test.go @@ -0,0 +1,949 @@ +package judge + +import ( + "fmt" + "testing" + + "github.com/launchdarkly/go-sdk-common/v3/ldvalue" + "github.com/launchdarkly/go-server-sdk/ldai" + "github.com/launchdarkly/go-server-sdk/ldai/datamodel" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type mockConfig struct { + messages []datamodel.Message + modelParam map[string]ldvalue.Value + customParam map[string]ldvalue.Value + evaluationMetricKey string + evaluationMetricKeys []string +} + +func (m *mockConfig) Messages() []datamodel.Message { + return m.messages +} + +func (m *mockConfig) ModelParam(key string) (ldvalue.Value, bool) { + val, ok := m.modelParam[key] + return val, ok +} + +func (m *mockConfig) CustomModelParam(key string) (ldvalue.Value, bool) { + val, ok := m.customParam[key] + return val, ok +} + +func (m *mockConfig) EvaluationMetricKey() string { + return m.evaluationMetricKey +} + +func (m *mockConfig) EvaluationMetricKeys() []string { + return m.evaluationMetricKeys +} + +type mockTracker struct { + judgeResponses []datamodel.JudgeResponse + usages []ldai.TokenUsage +} + +func (m *mockTracker) TrackJudgeResponse(response datamodel.JudgeResponse) error { + m.judgeResponses = append(m.judgeResponses, response) + return nil +} + +func (m *mockTracker) TrackUsage(usage ldai.TokenUsage) error { + m.usages = append(m.usages, usage) + return nil +} + +type mockProvider struct { + response StructuredResponse + err error + calls [][]datamodel.Message +} + +func (m *mockProvider) InvokeStructuredModel(messages []datamodel.Message, schema map[string]interface{}) (StructuredResponse, error) { + m.calls = append(m.calls, messages) + return m.response, m.err +} + +func TestNew(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + } + tracker := &mockTracker{} + provider := &mockProvider{} + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + assert.NotNil(t, judge) + assert.Equal(t, "$ld:ai:judge:relevance", judge.metricKey) + assert.Equal(t, "test-judge", judge.judgeConfigKey) +} + +func TestNew_MissingMetricKey(t *testing.T) { + config := &mockConfig{} + tracker := &mockTracker{} + provider := &mockProvider{} + + judge, err := New(config, tracker, provider, "test-judge", nil) + assert.Error(t, err) + assert.Nil(t, judge) + assert.Contains(t, err.Error(), "missing evaluationMetricKey") +} + +func TestNew_NilInputs(t *testing.T) { + config := &mockConfig{evaluationMetricKey: "test"} + tracker := &mockTracker{} + provider := &mockProvider{} + + _, err := New(nil, tracker, provider, "test", nil) + assert.Error(t, err) + + _, err = New(config, nil, provider, "test", nil) + assert.Error(t, err) + + _, err = New(config, tracker, nil, "test", nil) + assert.Error(t, err) +} + +func TestEvaluate_Success(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{ + {Role: datamodel.System, Content: "Evaluate this"}, + {Role: datamodel.User, Content: "Input: {{message_history}}"}, + {Role: datamodel.User, Content: "Output: {{response_to_evaluate}}"}, + }, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": 
map[string]interface{}{ + "score": 0.85, + "reasoning": "Highly relevant", + }, + }, + }, + Usage: ldai.TokenUsage{Total: 100, Input: 60, Output: 40}, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("test input", "test output", 1.0) + require.NoError(t, err) + assert.NotNil(t, result) + assert.True(t, result.Success) + assert.Equal(t, "test-judge", result.JudgeConfigKey) + assert.Len(t, result.Evals, 1) + assert.Equal(t, 0.85, result.Evals["$ld:ai:judge:relevance"].Score) + assert.Equal(t, "Highly relevant", result.Evals["$ld:ai:judge:relevance"].Reasoning) + + assert.Len(t, tracker.usages, 1) + assert.Equal(t, 100, tracker.usages[0].Total) + // Note: Judge should NOT track responses internally - this is caller's responsibility + // The judge's tracker is only used for usage/duration metrics + assert.Len(t, tracker.judgeResponses, 0, "Judge should not track responses internally") + + require.Len(t, provider.calls, 1) + assert.Equal(t, "Evaluate this", provider.calls[0][0].Content) + assert.Equal(t, "Input: test input", provider.calls[0][1].Content) + assert.Equal(t, "Output: test output", provider.calls[0][2].Content) +} + +func TestEvaluate_NoMessages(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{}, + } + tracker := &mockTracker{} + provider := &mockProvider{} + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.Nil(t, result) +} + +func TestEvaluate_Sampling(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{} + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + sampled := 0 + for i := 0; i < 100; i++ { + result, _ := judge.Evaluate("input", "output", 0.0) + if result != nil { + sampled++ + } + } + assert.Equal(t, 0, sampled) +} + +func TestEvaluate_ProviderError(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{err: fmt.Errorf("provider error")} + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.Equal(t, "test-judge", result.JudgeConfigKey) + assert.Contains(t, result.Error, "provider error") +} + +func TestEvaluate_InvalidResponse(t *testing.T) { + tests := []struct { + name string + response map[string]interface{} + }{ + { + name: "missing evaluations", + response: map[string]interface{}{}, + }, + { + name: "missing metric key", + response: map[string]interface{}{ + "evaluations": map[string]interface{}{}, + }, + }, + { + name: "invalid score type", + response: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": "not a number", + "reasoning": "test", + }, + }, + }, + }, + { + name: "score out of range", + response: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": 
1.5, + "reasoning": "test", + }, + }, + }, + }, + { + name: "invalid reasoning type", + response: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": 0.5, + "reasoning": 123, + }, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{Content: tt.response}, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.NotEmpty(t, result.Error) + }) + } +} + +func TestEvaluateMessages(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "{{message_history}}"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": 0.9, + "reasoning": "Excellent", + }, + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + messages := []datamodel.Message{ + {Role: datamodel.User, Content: "Hello"}, + {Role: datamodel.Assistant, Content: "Hi there"}, + } + + result, err := judge.EvaluateMessages(messages, "response", 1.0) + require.NoError(t, err) + assert.NotNil(t, result) + assert.True(t, result.Success) + + require.Len(t, provider.calls, 1) + assert.Contains(t, provider.calls[0][0].Content, "Hello\r\nHi there") +} + +func TestGetMetricKey(t *testing.T) { + tests := []struct { + name string + evaluationMetricKey string + evaluationMetricKeys []string + want string + wantErr bool + }{ + { + name: "from top-level field (primary)", + evaluationMetricKey: "$ld:ai:judge:toplevel", + want: "$ld:ai:judge:toplevel", + }, + { + name: "top-level field has priority over array", + evaluationMetricKey: "$ld:ai:judge:toplevel", + evaluationMetricKeys: []string{"$ld:ai:judge:array"}, + want: "$ld:ai:judge:toplevel", + }, + { + name: "missing", + wantErr: true, + }, + { + name: "trim whitespace from top-level", + evaluationMetricKey: " $ld:ai:judge:toplevel ", + want: "$ld:ai:judge:toplevel", + }, + { + name: "from evaluationMetricKeys array", + evaluationMetricKeys: []string{"$ld:ai:judge:relevance", "$ld:ai:judge:accuracy"}, + want: "$ld:ai:judge:relevance", + }, + { + name: "skip empty strings in array", + evaluationMetricKeys: []string{"", " ", "$ld:ai:judge:relevance"}, + want: "$ld:ai:judge:relevance", + }, + { + name: "trim whitespace from array entry", + evaluationMetricKeys: []string{" $ld:ai:judge:relevance "}, + want: "$ld:ai:judge:relevance", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: tt.evaluationMetricKey, + evaluationMetricKeys: tt.evaluationMetricKeys, + } + got, err := getMetricKey(config, nil, "test-judge") + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.want, got) + } + }) + } +} + +func TestBuildSchema(t *testing.T) { + schema := buildSchema("$ld:ai:judge:relevance") + + assert.Equal(t, "object", 
schema["type"]) + assert.Contains(t, schema, "properties") + assert.Contains(t, schema, "required") + + props := schema["properties"].(map[string]interface{}) + evals := props["evaluations"].(map[string]interface{}) + evalProps := evals["properties"].(map[string]interface{}) + + assert.Contains(t, evalProps, "$ld:ai:judge:relevance") + + metricSchema := evalProps["$ld:ai:judge:relevance"].(map[string]interface{}) + metricProps := metricSchema["properties"].(map[string]interface{}) + + assert.Contains(t, metricProps, "score") + assert.Contains(t, metricProps, "reasoning") + + scoreSchema := metricProps["score"].(map[string]interface{}) + assert.Equal(t, "number", scoreSchema["type"]) + assert.Equal(t, 0.0, scoreSchema["minimum"]) + assert.Equal(t, 1.0, scoreSchema["maximum"]) +} + +func TestBuildSchema_Empty(t *testing.T) { + schema := buildSchema("") + assert.Empty(t, schema) +} + +func TestEvaluate_NegativeScore(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": -0.5, + "reasoning": "negative score", + }, + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.Contains(t, result.Error, "invalid score") +} + +func TestEvaluate_ScoreGreaterThanOne(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": 1.5, + "reasoning": "over limit", + }, + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.Contains(t, result.Error, "invalid score") +} + +func TestEvaluate_NullEvaluationValue(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": nil, + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.Contains(t, result.Error, "missing evaluation") +} + +func TestEvaluate_NonObjectEvaluationValue(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": 
map[string]interface{}{ + "$ld:ai:judge:relevance": "not an object", + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.Contains(t, result.Error, "missing evaluation") +} + +func TestEvaluate_ScoreAsString(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": "0.5", + "reasoning": "test", + }, + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.Contains(t, result.Error, "invalid score") +} + +func TestEvaluate_ReasoningAsNumber(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": 0.5, + "reasoning": 123, + }, + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.Contains(t, result.Error, "invalid reasoning") +} + +func TestEvaluate_EmptyEvaluationsObject(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{}, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.Contains(t, result.Error, "missing evaluation") +} + +func TestGetMetricKey_EmptyArray(t *testing.T) { + config := &mockConfig{ + evaluationMetricKeys: []string{}, + } + + _, err := getMetricKey(config, nil, "test-judge") + assert.Error(t, err) + assert.Contains(t, err.Error(), "missing evaluationMetricKey") +} + +func TestGetMetricKey_ArrayWithOnlyEmptyStrings(t *testing.T) { + config := &mockConfig{ + evaluationMetricKeys: []string{"", " ", "\t"}, + } + + _, err := getMetricKey(config, nil, "test-judge") + assert.Error(t, err) + assert.Contains(t, err.Error(), "missing evaluationMetricKey") +} + +func TestEvaluate_ReturnsCorrectResponse(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": 
map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": 0.75, + "reasoning": "Good response", + }, + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "my-judge-config", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + require.NoError(t, err) + require.NotNil(t, result) + + // Verify the returned response has correct values + assert.Equal(t, "my-judge-config", result.JudgeConfigKey) + assert.True(t, result.Success) + assert.Equal(t, 0.75, result.Evals["$ld:ai:judge:relevance"].Score) + assert.Equal(t, "Good response", result.Evals["$ld:ai:judge:relevance"].Reasoning) + + // Judge should NOT track responses internally - this is caller's responsibility + assert.Len(t, tracker.judgeResponses, 0, "Judge should not track responses internally") +} + +func TestEvaluate_TokenUsageTracked(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": 0.5, + "reasoning": "test", + }, + }, + }, + Usage: ldai.TokenUsage{Total: 150, Input: 90, Output: 60}, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + _, err = judge.Evaluate("input", "output", 1.0) + require.NoError(t, err) + + require.Len(t, tracker.usages, 1) + assert.Equal(t, 150, tracker.usages[0].Total) + assert.Equal(t, 90, tracker.usages[0].Input) + assert.Equal(t, 60, tracker.usages[0].Output) +} + +func TestEvaluate_NoTokenUsageWhenZero(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": 0.5, + "reasoning": "test", + }, + }, + }, + Usage: ldai.TokenUsage{}, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + _, err = judge.Evaluate("input", "output", 1.0) + require.NoError(t, err) + + assert.Len(t, tracker.usages, 0) +} + +func TestEvaluate_ErrorResponseIncludesJudgeConfigKey(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + tracker := &mockTracker{} + provider := &mockProvider{err: fmt.Errorf("test error")} + + judge, err := New(config, tracker, provider, "error-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.False(t, result.Success) + assert.Equal(t, "error-judge", result.JudgeConfigKey) +} + +// Integration Tests - These verify end-to-end behavior and patterns not caught by unit tests + +// TestDoubleInterpolation_ReservedVariables verifies that the double interpolation pattern works: +// 1. During config fetch, pass literal strings "{{message_history}}" and "{{response_to_evaluate}}" +// 2. First interpolation preserves these placeholders in the template +// 3. 
During evaluation, second interpolation replaces placeholders with actual values +func TestDoubleInterpolation_ReservedVariables(t *testing.T) { + // Simulate what the client does when fetching a judge config + // The config from LaunchDarkly has templates with {{message_history}} and {{response_to_evaluate}} + rawTemplate := "Input: {{message_history}}\nOutput: {{response_to_evaluate}}" + + // Simulate first interpolation (done by client.Config when fetching judge config) + // Variables passed should include literal placeholder strings + variablesForFirstInterpolation := map[string]interface{}{ + "message_history": "{{message_history}}", // Literal string! + "response_to_evaluate": "{{response_to_evaluate}}", // Literal string! + } + + // First interpolation - should preserve placeholders + firstInterpolated := interpolateTemplateForTest(rawTemplate, variablesForFirstInterpolation) + assert.Equal(t, rawTemplate, firstInterpolated, "First interpolation should preserve placeholders") + + // Now simulate what the judge does during Evaluate() + // Second interpolation with actual values + actualInput := "What is LaunchDarkly?" + actualOutput := "LaunchDarkly is a feature management platform." + + variablesForSecondInterpolation := map[string]interface{}{ + "message_history": actualInput, + "response_to_evaluate": actualOutput, + } + + // Second interpolation - should replace with actual values + secondInterpolated := interpolateTemplateForTest(firstInterpolated, variablesForSecondInterpolation) + expected := "Input: What is LaunchDarkly?\nOutput: LaunchDarkly is a feature management platform." + assert.Equal(t, expected, secondInterpolated, "Second interpolation should replace with actual values") +} + +// TestJudgeEvaluation_WithTemplateInterpolation verifies end-to-end template interpolation +func TestJudgeEvaluation_WithTemplateInterpolation(t *testing.T) { + // Config with templates that should be interpolated during evaluation + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:test", + messages: []datamodel.Message{ + {Role: datamodel.System, Content: "You are a judge"}, + {Role: datamodel.User, Content: "Input: {{message_history}}"}, + {Role: datamodel.User, Content: "Output: {{response_to_evaluate}}"}, + }, + } + + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:test": map[string]interface{}{ + "score": 0.9, + "reasoning": "Good response", + }, + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + // Evaluate with actual input/output + actualInput := "What is AI?" + actualOutput := "AI is artificial intelligence" + + result, err := judge.Evaluate(actualInput, actualOutput, 1.0) + require.NoError(t, err) + require.NotNil(t, result) + + // Verify the provider received interpolated messages + require.Len(t, provider.calls, 1) + messages := provider.calls[0] + + assert.Equal(t, "You are a judge", messages[0].Content) + assert.Equal(t, "Input: What is AI?", messages[1].Content, "message_history should be interpolated") + assert.Equal(t, "Output: AI is artificial intelligence", messages[2].Content, "response_to_evaluate should be interpolated") +} + +// TestJudgeTracking_ShouldNotTrackInternally verifies that the judge does NOT track responses internally. +// Tracking should be the responsibility of the caller (AI config being evaluated). 
+func TestJudgeTracking_ShouldNotTrackInternally(t *testing.T) { + config := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:test", + messages: []datamodel.Message{{Role: datamodel.User, Content: "test"}}, + } + + tracker := &mockTracker{} + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:test": map[string]interface{}{ + "score": 0.5, + "reasoning": "Test", + }, + }, + }, + }, + } + + judge, err := New(config, tracker, provider, "test-judge", nil) + require.NoError(t, err) + + result, err := judge.Evaluate("input", "output", 1.0) + require.NoError(t, err) + require.NotNil(t, result) + + // CRITICAL: Judge should NOT track responses internally + // This should be done by the caller (AI config being evaluated) + assert.Len(t, tracker.judgeResponses, 0, "Judge should not track responses internally - caller's responsibility") + + // Verify usage is still tracked (this is judge-specific) + assert.Len(t, tracker.usages, 0, "No usage was set in this test") +} + +// TestIntegration_AIConfigTracksJudgeResults simulates the real-world pattern where +// an AI config evaluates with a judge and tracks results on its own tracker. +func TestIntegration_AIConfigTracksJudgeResults(t *testing.T) { + // Simulate AI config's tracker + aiConfigTracker := &mockTracker{} + + // Simulate judge's tracker (should NOT be used for judge response tracking) + judgeTracker := &mockTracker{} + + // Judge configuration + judgeConfig := &mockConfig{ + evaluationMetricKey: "$ld:ai:judge:relevance", + messages: []datamodel.Message{ + {Role: datamodel.User, Content: "Evaluate: {{message_history}} -> {{response_to_evaluate}}"}, + }, + } + + provider := &mockProvider{ + response: StructuredResponse{ + Content: map[string]interface{}{ + "evaluations": map[string]interface{}{ + "$ld:ai:judge:relevance": map[string]interface{}{ + "score": 0.85, + "reasoning": "Highly relevant", + }, + }, + }, + }, + } + + // Create judge with its own tracker + judge, err := New(judgeConfig, judgeTracker, provider, "test-judge", nil) + require.NoError(t, err) + + // AI config evaluates with the judge + result, err := judge.Evaluate("What is AI?", "AI is artificial intelligence", 1.0) + require.NoError(t, err) + require.NotNil(t, result) + + // AI config tracks the result on its own tracker (NOT the judge's tracker) + err = aiConfigTracker.TrackJudgeResponse(*result) + require.NoError(t, err) + + // Verify tracking happened on AI config's tracker + assert.Len(t, aiConfigTracker.judgeResponses, 1, "AI config should track judge response") + assert.Equal(t, 0.85, aiConfigTracker.judgeResponses[0].Evals["$ld:ai:judge:relevance"].Score) + + // Verify judge did NOT track on its own tracker + assert.Len(t, judgeTracker.judgeResponses, 0, "Judge should not track responses internally") +} + +// Helper function to test template interpolation +func interpolateTemplateForTest(template string, vars map[string]interface{}) string { + // Simple string replacement for testing + // Real code uses: mustache.New().ParseString(template).RenderString(vars) + result := template + for key, value := range vars { + placeholder := "{{" + key + "}}" + if str, ok := value.(string); ok { + result = replaceAllForTest(result, placeholder, str) + } + } + return result +} + +func replaceAllForTest(s, old, new string) string { + result := "" + for { + i := indexOfForTest(s, old) + if i == -1 { + result += s + break + } + result += s[:i] + new + s = s[i+len(old):] + } + return 
result +} + +func indexOfForTest(s, substr string) int { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return i + } + } + return -1 +} diff --git a/ldai/judge/schema.go b/ldai/judge/schema.go new file mode 100644 index 00000000..864e28a8 --- /dev/null +++ b/ldai/judge/schema.go @@ -0,0 +1,40 @@ +package judge + +func buildSchema(metricKey string) map[string]interface{} { + if metricKey == "" { + return map[string]interface{}{} + } + + return map[string]interface{}{ + "type": "object", + "properties": map[string]interface{}{ + "evaluations": map[string]interface{}{ + "type": "object", + "description": "Object containing evaluation results for " + metricKey + " metric", + "properties": map[string]interface{}{ + metricKey: map[string]interface{}{ + "type": "object", + "properties": map[string]interface{}{ + "score": map[string]interface{}{ + "type": "number", + "minimum": 0.0, + "maximum": 1.0, + "description": "Score between 0.0 and 1.0 for " + metricKey, + }, + "reasoning": map[string]interface{}{ + "type": "string", + "description": "Reasoning behind the score for " + metricKey, + }, + }, + "required": []string{"score", "reasoning"}, + "additionalProperties": false, + }, + }, + "required": []string{metricKey}, + "additionalProperties": false, + }, + }, + "required": []string{"evaluations"}, + "additionalProperties": false, + } +} diff --git a/ldai/tracker.go b/ldai/tracker.go index 9da020c2..90655242 100644 --- a/ldai/tracker.go +++ b/ldai/tracker.go @@ -7,6 +7,7 @@ import ( ldcommon "github.com/launchdarkly/go-sdk-common/v3" "github.com/launchdarkly/go-sdk-common/v3/ldcontext" "github.com/launchdarkly/go-sdk-common/v3/ldvalue" + "github.com/launchdarkly/go-server-sdk/ldai/datamodel" "github.com/launchdarkly/go-server-sdk/v7/interfaces" ) @@ -340,3 +341,33 @@ func (t *Tracker) TrackRequest(task func(c *Config) (ProviderResponse, error)) ( return usage, nil } + +// TrackJudgeResponse tracks the evaluation scores from a judge response. +func (t *Tracker) TrackJudgeResponse(response datamodel.JudgeResponse) error { + if !response.Success { + return nil + } + + // Build the data object once, since it's constant across all iterations + data := t.trackData + if response.JudgeConfigKey != "" { + builder := ldvalue.ObjectBuild() + for _, key := range t.trackData.Keys(nil) { + builder.Set(key, t.trackData.GetByKey(key)) + } + data = builder.Set("judgeConfigKey", ldvalue.String(response.JudgeConfigKey)).Build() + } + + var failed bool + for metricKey, evalScore := range response.Evals { + if err := t.events.TrackMetric(metricKey, t.context, evalScore.Score, data); err != nil { + t.logWarning("error tracking metric %s: %v", metricKey, err) + failed = true + } + } + + if failed { + return fmt.Errorf("error tracking evaluation scores") + } + return nil +}
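
A minimal end-to-end sketch of how these pieces are intended to compose, based only on the APIs added in this diff. Everything not introduced by the diff — sdkClient, ctx, userInput, modelOutput, samplingRate, myProvider (an application adapter implementing judge.Provider), and appTracker (the tracker of the AI config under evaluation) — is an assumption for illustration, as is the premise that *ldai.Config and *ldai.Tracker satisfy the judge.Config and judge.Tracker interfaces as-is.

    // Sketch only; identifiers not defined in this diff are assumptions.
    aiClient, err := ldai.NewClient(sdkClient) // sdkClient: the application's LaunchDarkly SDK client (assumed)
    if err != nil {
        return err
    }

    // JudgeConfig preserves the {{message_history}} and {{response_to_evaluate}} placeholders
    // through the first interpolation so Judge.Evaluate can fill them in later.
    judgeCfg, judgeTracker := aiClient.JudgeConfig("my-judge", ctx, ldai.Disabled(), nil)

    // myProvider: hypothetical adapter implementing judge.Provider for the model backend.
    j, err := judge.New(&judgeCfg, judgeTracker, myProvider, "my-judge", nil)
    if err != nil {
        return err
    }

    result, err := j.Evaluate(userInput, modelOutput, samplingRate)
    if err != nil {
        return err
    }
    if result == nil {
        return nil // evaluation skipped: no judge messages, or not selected by sampling
    }

    // Per the comments in Evaluate, the caller (the AI config being evaluated) records the
    // judge response on its own tracker rather than on the judge's tracker.
    return appTracker.TrackJudgeResponse(*result)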