From e431bc234b9d81eeab612e845150dca818f9257c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 11 May 2025 18:31:04 +0200 Subject: [PATCH] feat(evaluation): add deep evaluation mechanism (#145) * feat(evaluation): add deep evaluation mechanism Signed-off-by: Ettore Di Giacinto * consider whole conversation when evaluating Signed-off-by: mudler --------- Signed-off-by: Ettore Di Giacinto Signed-off-by: mudler --- core/agent/agent.go | 44 +++++++-- core/agent/evaluation.go | 162 +++++++++++++++++++++++++++++++++ core/agent/identity.go | 2 +- core/agent/options.go | 24 ++++- core/state/config.go | 20 ++++ core/state/pool.go | 9 +- core/types/job.go | 44 ++++++--- pkg/llm/json.go | 20 ++-- services/filters/classifier.go | 2 +- webui/app.go | 2 +- 10 files changed, 293 insertions(+), 36 deletions(-) create mode 100644 core/agent/evaluation.go diff --git a/core/agent/agent.go b/core/agent/agent.go index 658fec9..016d9bf 100644 --- a/core/agent/agent.go +++ b/core/agent/agent.go @@ -525,7 +525,7 @@ func (a *Agent) filterJob(job *types.Job) (ok bool, err error) { if ok { triggeredBy = name xlog.Info("Job triggered by filter", "filter", name) - } + } } else if !ok { failedBy = name xlog.Info("Job failed filter", "filter", name) @@ -560,7 +560,6 @@ func (a *Agent) filterJob(job *types.Job) (ok bool, err error) { } func (a *Agent) consumeJob(job *types.Job, role string, retries int) { - if err := job.GetContext().Err(); err != nil { job.Result.Finish(fmt.Errorf("expired")) return @@ -659,12 +658,9 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) { } } - //xlog.Debug("Picked action", "agent", a.Character.Name, "action", chosenAction.Definition().Name, "reasoning", reasoning) if chosenAction == nil { // If no action was picked up, the reasoning is the message returned by the assistant // so we can consume it as if it was a reply. - //job.Result.SetResult(ActionState{ActionCurrentState{nil, nil, "No action to do, just reply"}, ""}) - //job.Result.Finish(fmt.Errorf("no action to do"))\ xlog.Info("No action to do, just reply", "agent", a.Character.Name, "reasoning", reasoning) if reasoning != "" { @@ -684,6 +680,23 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) { reasoning = msg.Content } + var satisfied bool + var err error + // Evaluate the response + satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop()) + if err != nil { + job.Result.Finish(fmt.Errorf("error evaluating response: %w", err)) + return + } + + if !satisfied { + // If not satisfied, continue with the conversation + job.ConversationHistory = conv + job.IncrementEvaluationLoop() + a.consumeJob(job, role, retries) + return + } + xlog.Debug("Finish job with reasoning", "reasoning", reasoning, "agent", a.Character.Name, "conversation", fmt.Sprintf("%+v", conv)) job.Result.Conversation = conv job.Result.AddFinalizer(func(conv []openai.ChatCompletionMessage) { @@ -773,8 +786,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) { conv, err = a.handlePlanning(job.GetContext(), job, chosenAction, actionParams, reasoning, pickTemplate, conv) if err != nil { xlog.Error("error handling planning", "error", err) - //job.Result.Conversation = conv - //job.Result.SetResponse(msg.Content) a.reply(job, role, append(conv, openai.ChatCompletionMessage{ Role: "assistant", Content: fmt.Sprintf("Error handling planning: %v", err), @@ -821,9 +832,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) { if !chosenAction.Definition().Name.Is(action.PlanActionName) { result, err := a.runAction(job, chosenAction, actionParams) if err != nil { - //job.Result.Finish(fmt.Errorf("error running action: %w", err)) - //return - // make the LLM aware of the error of running the action instead of stopping the job here result.Result = fmt.Sprintf("Error running tool: %v", err) } @@ -866,6 +874,22 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) { return } + // Evaluate the final response + var satisfied bool + satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop()) + if err != nil { + job.Result.Finish(fmt.Errorf("error evaluating response: %w", err)) + return + } + + if !satisfied { + // If not satisfied, continue with the conversation + job.ConversationHistory = conv + job.IncrementEvaluationLoop() + a.consumeJob(job, role, retries) + return + } + a.reply(job, role, conv, actionParams, chosenAction, reasoning) } diff --git a/core/agent/evaluation.go b/core/agent/evaluation.go new file mode 100644 index 0000000..7373f91 --- /dev/null +++ b/core/agent/evaluation.go @@ -0,0 +1,162 @@ +package agent + +import ( + "fmt" + + "github.com/mudler/LocalAGI/core/types" + "github.com/mudler/LocalAGI/pkg/llm" + "github.com/mudler/LocalAGI/pkg/xlog" + "github.com/sashabaranov/go-openai" + "github.com/sashabaranov/go-openai/jsonschema" +) + +type EvaluationResult struct { + Satisfied bool `json:"satisfied"` + Gaps []string `json:"gaps"` + Reasoning string `json:"reasoning"` +} + +type GoalExtraction struct { + Goal string `json:"goal"` + Constraints []string `json:"constraints"` + Context string `json:"context"` +} + +func (a *Agent) extractGoal(job *types.Job, conv []openai.ChatCompletionMessage) (*GoalExtraction, error) { + // Create the goal extraction schema + schema := jsonschema.Definition{ + Type: jsonschema.Object, + Properties: map[string]jsonschema.Definition{ + "goal": { + Type: jsonschema.String, + Description: "The main goal or request from the user", + }, + "constraints": { + Type: jsonschema.Array, + Items: &jsonschema.Definition{ + Type: jsonschema.String, + }, + Description: "Any constraints or requirements specified by the user", + }, + "context": { + Type: jsonschema.String, + Description: "Additional context that might be relevant for understanding the goal", + }, + }, + Required: []string{"goal", "constraints", "context"}, + } + + // Create the goal extraction prompt + prompt := `Analyze the conversation and extract the user's main goal, any constraints, and relevant context. +Consider the entire conversation history to understand the complete context and requirements. +Focus on identifying the primary objective and any specific requirements or limitations mentioned.` + + var result GoalExtraction + err := llm.GenerateTypedJSONWithConversation(job.GetContext(), a.client, + append( + []openai.ChatCompletionMessage{ + { + Role: "system", + Content: prompt, + }, + }, + conv...), a.options.LLMAPI.Model, schema, &result) + if err != nil { + return nil, fmt.Errorf("error extracting goal: %w", err) + } + + return &result, nil +} + +func (a *Agent) evaluateJob(job *types.Job, conv []openai.ChatCompletionMessage) (*EvaluationResult, error) { + if !a.options.enableEvaluation { + return &EvaluationResult{Satisfied: true}, nil + } + + // Extract the goal first + goal, err := a.extractGoal(job, conv) + if err != nil { + return nil, fmt.Errorf("error extracting goal: %w", err) + } + + // Create the evaluation schema + schema := jsonschema.Definition{ + Type: jsonschema.Object, + Properties: map[string]jsonschema.Definition{ + "satisfied": { + Type: jsonschema.Boolean, + }, + "gaps": { + Type: jsonschema.Array, + Items: &jsonschema.Definition{ + Type: jsonschema.String, + }, + }, + "reasoning": { + Type: jsonschema.String, + }, + }, + Required: []string{"satisfied", "gaps", "reasoning"}, + } + + // Create the evaluation prompt + prompt := fmt.Sprintf(`Evaluate if the assistant has satisfied the user's request. Consider: +1. The identified goal: %s +2. Constraints and requirements: %v +3. Context: %s +4. The conversation history +5. Any gaps or missing information +6. Whether the response fully addresses the user's needs + +Provide a detailed evaluation with specific gaps if any are found.`, + goal.Goal, + goal.Constraints, + goal.Context) + + var result EvaluationResult + err = llm.GenerateTypedJSONWithConversation(job.GetContext(), a.client, + append( + []openai.ChatCompletionMessage{ + { + Role: "system", + Content: prompt, + }, + }, + conv...), + a.options.LLMAPI.Model, schema, &result) + if err != nil { + return nil, fmt.Errorf("error generating evaluation: %w", err) + } + + return &result, nil +} + +func (a *Agent) handleEvaluation(job *types.Job, conv []openai.ChatCompletionMessage, currentLoop int) (bool, []openai.ChatCompletionMessage, error) { + if !a.options.enableEvaluation || currentLoop >= a.options.maxEvaluationLoops { + return true, conv, nil + } + + result, err := a.evaluateJob(job, conv) + if err != nil { + return false, conv, err + } + + if result.Satisfied { + return true, conv, nil + } + + // If there are gaps, we need to address them + if len(result.Gaps) > 0 { + // Add the evaluation result to the conversation + conv = append(conv, openai.ChatCompletionMessage{ + Role: "system", + Content: fmt.Sprintf("Evaluation found gaps that need to be addressed:\n%s\nReasoning: %s", + result.Gaps, result.Reasoning), + }) + + xlog.Debug("Evaluation found gaps, incrementing loop count", "loop", currentLoop+1) + return false, conv, nil + } + + return true, conv, nil +} diff --git a/core/agent/identity.go b/core/agent/identity.go index a474d67..60ff14f 100644 --- a/core/agent/identity.go +++ b/core/agent/identity.go @@ -12,7 +12,7 @@ func (a *Agent) generateIdentity(guidance string) error { guidance = "Generate a random character for roleplaying." } - err := llm.GenerateTypedJSON(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character) + err := llm.GenerateTypedJSONWithGuidance(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character) //err := llm.GenerateJSONFromStruct(a.context.Context, a.client, guidance, a.options.LLMAPI.Model, &a.options.character) a.Character = a.options.character if err != nil { diff --git a/core/agent/options.go b/core/agent/options.go index 4943dad..c7f4514 100644 --- a/core/agent/options.go +++ b/core/agent/options.go @@ -42,6 +42,10 @@ type options struct { kbResults int ragdb RAGDB + // Evaluation settings + maxEvaluationLoops int + enableEvaluation bool + prompts []DynamicPrompt systemPrompt string @@ -68,9 +72,11 @@ func (o *options) SeparatedMultimodalModel() bool { func defaultOptions() *options { return &options{ - parallelJobs: 1, - periodicRuns: 15 * time.Minute, + parallelJobs: 1, + periodicRuns: 15 * time.Minute, loopDetectionSteps: 10, + maxEvaluationLoops: 2, + enableEvaluation: false, LLMAPI: llmOptions{ APIURL: "http://localhost:8080", Model: "gpt-4", @@ -392,3 +398,17 @@ var EnableStripThinkingTags = func(o *options) error { o.stripThinkingTags = true return nil } + +func WithMaxEvaluationLoops(loops int) Option { + return func(o *options) error { + o.maxEvaluationLoops = loops + return nil + } +} + +func EnableEvaluation() Option { + return func(o *options) error { + o.enableEvaluation = true + return nil + } +} diff --git a/core/state/config.go b/core/state/config.go index d807ef7..0898975 100644 --- a/core/state/config.go +++ b/core/state/config.go @@ -74,6 +74,8 @@ type AgentConfig struct { SummaryLongTermMemory bool `json:"summary_long_term_memory" form:"summary_long_term_memory"` ParallelJobs int `json:"parallel_jobs" form:"parallel_jobs"` StripThinkingTags bool `json:"strip_thinking_tags" form:"strip_thinking_tags"` + EnableEvaluation bool `json:"enable_evaluation" form:"enable_evaluation"` + MaxEvaluationLoops int `json:"max_evaluation_loops" form:"max_evaluation_loops"` } type AgentConfigMeta struct { @@ -309,6 +311,24 @@ func NewAgentConfigMeta( HelpText: "Remove content between and tags from agent responses", Tags: config.Tags{Section: "ModelSettings"}, }, + { + Name: "enable_evaluation", + Label: "Enable Evaluation", + Type: "checkbox", + DefaultValue: false, + HelpText: "Enable automatic evaluation of agent responses to ensure they meet user requirements", + Tags: config.Tags{Section: "AdvancedSettings"}, + }, + { + Name: "max_evaluation_loops", + Label: "Max Evaluation Loops", + Type: "number", + DefaultValue: 2, + Min: 1, + Step: 1, + HelpText: "Maximum number of evaluation loops to perform when addressing gaps in responses", + Tags: config.Tags{Section: "AdvancedSettings"}, + }, }, MCPServers: []config.Field{ { diff --git a/core/state/pool.go b/core/state/pool.go index 5cbac14..1ca8415 100644 --- a/core/state/pool.go +++ b/core/state/pool.go @@ -247,7 +247,7 @@ func createAgentAvatar(APIURL, APIKey, model, imageModel, avatarDir string, agen ImagePrompt string `json:"image_prompt"` } - err := llm.GenerateTypedJSON( + err := llm.GenerateTypedJSONWithGuidance( context.Background(), llm.NewClient(APIKey, APIURL, "10m"), "Generate a prompt that I can use to create a random avatar for the bot '"+agent.Name+"', the description of the bot is: "+agent.Description, @@ -561,6 +561,13 @@ func (a *AgentPool) startAgentWithConfig(name string, config *AgentConfig, obs O opts = append(opts, WithParallelJobs(config.ParallelJobs)) } + if config.EnableEvaluation { + opts = append(opts, EnableEvaluation()) + if config.MaxEvaluationLoops > 0 { + opts = append(opts, WithMaxEvaluationLoops(config.MaxEvaluationLoops)) + } + } + xlog.Info("Starting agent", "name", name, "config", config) agent, err := New(opts...) diff --git a/core/types/job.go b/core/types/job.go index 7a48ee1..c701c05 100644 --- a/core/types/job.go +++ b/core/types/job.go @@ -162,23 +162,23 @@ func newUUID() string { // To wait for a Job result, use JobResult.WaitResult() func NewJob(opts ...JobOption) *Job { j := &Job{ - Result: NewJobResult(), - UUID: newUUID(), - } - for _, o := range opts { - o(j) + Result: NewJobResult(), + UUID: uuid.New().String(), + Metadata: make(map[string]interface{}), + context: context.Background(), + ConversationHistory: []openai.ChatCompletionMessage{}, } - var ctx context.Context - if j.context == nil { - ctx = context.Background() - } else { - ctx = j.context + for _, opt := range opts { + opt(j) } - context, cancel := context.WithCancel(ctx) - j.context = context + // Store the original request if it exists in the conversation history + + ctx, cancel := context.WithCancel(j.context) + j.context = ctx j.cancel = cancel + return j } @@ -207,3 +207,23 @@ func WithObservable(obs *Observable) JobOption { j.Obs = obs } } + +// GetEvaluationLoop returns the current evaluation loop count +func (j *Job) GetEvaluationLoop() int { + if j.Metadata == nil { + j.Metadata = make(map[string]interface{}) + } + if loop, ok := j.Metadata["evaluation_loop"].(int); ok { + return loop + } + return 0 +} + +// IncrementEvaluationLoop increments the evaluation loop count +func (j *Job) IncrementEvaluationLoop() { + if j.Metadata == nil { + j.Metadata = make(map[string]interface{}) + } + currentLoop := j.GetEvaluationLoop() + j.Metadata["evaluation_loop"] = currentLoop + 1 +} diff --git a/pkg/llm/json.go b/pkg/llm/json.go index 5386fe1..c4f48d1 100644 --- a/pkg/llm/json.go +++ b/pkg/llm/json.go @@ -10,16 +10,20 @@ import ( "github.com/sashabaranov/go-openai/jsonschema" ) -func GenerateTypedJSON(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error { +func GenerateTypedJSONWithGuidance(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error { + return GenerateTypedJSONWithConversation(ctx, client, []openai.ChatCompletionMessage{ + { + Role: "user", + Content: guidance, + }, + }, model, i, dst) +} + +func GenerateTypedJSONWithConversation(ctx context.Context, client *openai.Client, conv []openai.ChatCompletionMessage, model string, i jsonschema.Definition, dst any) error { toolName := "json" decision := openai.ChatCompletionRequest{ - Model: model, - Messages: []openai.ChatCompletionMessage{ - { - Role: "user", - Content: guidance, - }, - }, + Model: model, + Messages: conv, Tools: []openai.Tool{ { diff --git a/services/filters/classifier.go b/services/filters/classifier.go index d85aa4f..e517a7c 100644 --- a/services/filters/classifier.go +++ b/services/filters/classifier.go @@ -78,7 +78,7 @@ func (f *ClassifierFilter) Apply(job *types.Job) (bool, error) { var result struct { Asserted bool `json:"answer"` } - err := llm.GenerateTypedJSON(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{ + err := llm.GenerateTypedJSONWithGuidance(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{ Type: jsonschema.Object, Properties: map[string]jsonschema.Definition{ "answer": { diff --git a/webui/app.go b/webui/app.go index acf5291..ff4b29e 100644 --- a/webui/app.go +++ b/webui/app.go @@ -576,7 +576,7 @@ func (a *App) GenerateGroupProfiles(pool *state.AgentPool) func(c *fiber.Ctx) er xlog.Debug("Generating group", "description", request.Descript) client := llm.NewClient(a.config.LLMAPIKey, a.config.LLMAPIURL, "10m") - err := llm.GenerateTypedJSON(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{ + err := llm.GenerateTypedJSONWithGuidance(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{ Type: jsonschema.Object, Properties: map[string]jsonschema.Definition{ "agents": {