feat(evaluation): add deep evaluation mechanism (#145)

* feat(evaluation): add deep evaluation mechanism

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* consider whole conversation when evaluating

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2025-05-11 18:31:04 +02:00
committed by GitHub
parent 289edb67a6
commit e431bc234b
10 changed files with 293 additions and 36 deletions

View File

@@ -560,7 +560,6 @@ func (a *Agent) filterJob(job *types.Job) (ok bool, err error) {
} }
func (a *Agent) consumeJob(job *types.Job, role string, retries int) { func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
if err := job.GetContext().Err(); err != nil { if err := job.GetContext().Err(); err != nil {
job.Result.Finish(fmt.Errorf("expired")) job.Result.Finish(fmt.Errorf("expired"))
return return
@@ -659,12 +658,9 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
} }
} }
//xlog.Debug("Picked action", "agent", a.Character.Name, "action", chosenAction.Definition().Name, "reasoning", reasoning)
if chosenAction == nil { if chosenAction == nil {
// If no action was picked up, the reasoning is the message returned by the assistant // If no action was picked up, the reasoning is the message returned by the assistant
// so we can consume it as if it was a reply. // so we can consume it as if it was a reply.
//job.Result.SetResult(ActionState{ActionCurrentState{nil, nil, "No action to do, just reply"}, ""})
//job.Result.Finish(fmt.Errorf("no action to do"))\
xlog.Info("No action to do, just reply", "agent", a.Character.Name, "reasoning", reasoning) xlog.Info("No action to do, just reply", "agent", a.Character.Name, "reasoning", reasoning)
if reasoning != "" { if reasoning != "" {
@@ -684,6 +680,23 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
reasoning = msg.Content reasoning = msg.Content
} }
var satisfied bool
var err error
// Evaluate the response
satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop())
if err != nil {
job.Result.Finish(fmt.Errorf("error evaluating response: %w", err))
return
}
if !satisfied {
// If not satisfied, continue with the conversation
job.ConversationHistory = conv
job.IncrementEvaluationLoop()
a.consumeJob(job, role, retries)
return
}
xlog.Debug("Finish job with reasoning", "reasoning", reasoning, "agent", a.Character.Name, "conversation", fmt.Sprintf("%+v", conv)) xlog.Debug("Finish job with reasoning", "reasoning", reasoning, "agent", a.Character.Name, "conversation", fmt.Sprintf("%+v", conv))
job.Result.Conversation = conv job.Result.Conversation = conv
job.Result.AddFinalizer(func(conv []openai.ChatCompletionMessage) { job.Result.AddFinalizer(func(conv []openai.ChatCompletionMessage) {
@@ -773,8 +786,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
conv, err = a.handlePlanning(job.GetContext(), job, chosenAction, actionParams, reasoning, pickTemplate, conv) conv, err = a.handlePlanning(job.GetContext(), job, chosenAction, actionParams, reasoning, pickTemplate, conv)
if err != nil { if err != nil {
xlog.Error("error handling planning", "error", err) xlog.Error("error handling planning", "error", err)
//job.Result.Conversation = conv
//job.Result.SetResponse(msg.Content)
a.reply(job, role, append(conv, openai.ChatCompletionMessage{ a.reply(job, role, append(conv, openai.ChatCompletionMessage{
Role: "assistant", Role: "assistant",
Content: fmt.Sprintf("Error handling planning: %v", err), Content: fmt.Sprintf("Error handling planning: %v", err),
@@ -821,9 +832,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
if !chosenAction.Definition().Name.Is(action.PlanActionName) { if !chosenAction.Definition().Name.Is(action.PlanActionName) {
result, err := a.runAction(job, chosenAction, actionParams) result, err := a.runAction(job, chosenAction, actionParams)
if err != nil { if err != nil {
//job.Result.Finish(fmt.Errorf("error running action: %w", err))
//return
// make the LLM aware of the error of running the action instead of stopping the job here
result.Result = fmt.Sprintf("Error running tool: %v", err) result.Result = fmt.Sprintf("Error running tool: %v", err)
} }
@@ -866,6 +874,22 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
return return
} }
// Evaluate the final response
var satisfied bool
satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop())
if err != nil {
job.Result.Finish(fmt.Errorf("error evaluating response: %w", err))
return
}
if !satisfied {
// If not satisfied, continue with the conversation
job.ConversationHistory = conv
job.IncrementEvaluationLoop()
a.consumeJob(job, role, retries)
return
}
a.reply(job, role, conv, actionParams, chosenAction, reasoning) a.reply(job, role, conv, actionParams, chosenAction, reasoning)
} }

162
core/agent/evaluation.go Normal file
View File

@@ -0,0 +1,162 @@
package agent
import (
"fmt"
"github.com/mudler/LocalAGI/core/types"
"github.com/mudler/LocalAGI/pkg/llm"
"github.com/mudler/LocalAGI/pkg/xlog"
"github.com/sashabaranov/go-openai"
"github.com/sashabaranov/go-openai/jsonschema"
)
// Structured payloads the evaluation flow asks the LLM to produce.
type (
	// EvaluationResult is the verdict decoded from the evaluation request.
	EvaluationResult struct {
		// Satisfied is true when the request was judged fulfilled.
		Satisfied bool `json:"satisfied"`
		// Gaps lists the shortcomings reported by the evaluation.
		Gaps []string `json:"gaps"`
		// Reasoning is the free-text justification for the verdict.
		Reasoning string `json:"reasoning"`
	}

	// GoalExtraction is the summary of the user's request decoded from the
	// goal-extraction request.
	GoalExtraction struct {
		// Goal is the main goal or request from the user.
		Goal string `json:"goal"`
		// Constraints are the constraints or requirements the user specified.
		Constraints []string `json:"constraints"`
		// Context is additional context relevant for understanding the goal.
		Context string `json:"context"`
	}
)
// extractGoal asks the agent's configured model to summarise the conversation
// into a GoalExtraction (goal, constraints, context).
//
// The request is sent with the job's context and consists of a system prompt
// followed by every message in conv. On failure the error is wrapped with the
// "error extracting goal" prefix and no result is returned.
func (a *Agent) extractGoal(job *types.Job, conv []openai.ChatCompletionMessage) (*GoalExtraction, error) {
	// JSON schema describing the structured answer expected from the model.
	stringItem := jsonschema.Definition{Type: jsonschema.String}
	properties := map[string]jsonschema.Definition{
		"goal": {
			Type:        jsonschema.String,
			Description: "The main goal or request from the user",
		},
		"constraints": {
			Type:        jsonschema.Array,
			Items:       &stringItem,
			Description: "Any constraints or requirements specified by the user",
		},
		"context": {
			Type:        jsonschema.String,
			Description: "Additional context that might be relevant for understanding the goal",
		},
	}
	goalSchema := jsonschema.Definition{
		Type:       jsonschema.Object,
		Properties: properties,
		Required:   []string{"goal", "constraints", "context"},
	}

	instructions := `Analyze the conversation and extract the user's main goal, any constraints, and relevant context.
Consider the entire conversation history to understand the complete context and requirements.
Focus on identifying the primary objective and any specific requirements or limitations mentioned.`

	// The system prompt goes first, followed by the untouched conversation.
	messages := make([]openai.ChatCompletionMessage, 0, len(conv)+1)
	messages = append(messages, openai.ChatCompletionMessage{
		Role:    "system",
		Content: instructions,
	})
	messages = append(messages, conv...)

	extracted := new(GoalExtraction)
	if err := llm.GenerateTypedJSONWithConversation(job.GetContext(), a.client, messages, a.options.LLMAPI.Model, goalSchema, extracted); err != nil {
		return nil, fmt.Errorf("error extracting goal: %w", err)
	}
	return extracted, nil
}
// evaluateJob asks the agent's configured model whether the conversation
// satisfies the user's request.
//
// When evaluation is disabled in the agent options it returns a result with
// Satisfied set to true without contacting the model. Otherwise it first
// extracts the user's goal via extractGoal, then sends an evaluation prompt
// (as a system message) followed by every message in conv, and decodes the
// answer into an EvaluationResult.
//
// A non-nil error is returned when either the goal extraction or the
// evaluation request fails; the result is nil in that case.
func (a *Agent) evaluateJob(job *types.Job, conv []openai.ChatCompletionMessage) (*EvaluationResult, error) {
	if !a.options.enableEvaluation {
		return &EvaluationResult{Satisfied: true}, nil
	}

	// Extract the goal first
	goal, err := a.extractGoal(job, conv)
	if err != nil {
		// extractGoal already prefixes its errors with "error extracting goal";
		// wrapping again here would only duplicate that prefix in the message.
		return nil, err
	}

	// Create the evaluation schema
	schema := jsonschema.Definition{
		Type: jsonschema.Object,
		Properties: map[string]jsonschema.Definition{
			"satisfied": {
				Type: jsonschema.Boolean,
			},
			"gaps": {
				Type: jsonschema.Array,
				Items: &jsonschema.Definition{
					Type: jsonschema.String,
				},
			},
			"reasoning": {
				Type: jsonschema.String,
			},
		},
		Required: []string{"satisfied", "gaps", "reasoning"},
	}

	// Create the evaluation prompt
	prompt := fmt.Sprintf(`Evaluate if the assistant has satisfied the user's request. Consider:
1. The identified goal: %s
2. Constraints and requirements: %v
3. Context: %s
4. The conversation history
5. Any gaps or missing information
6. Whether the response fully addresses the user's needs
Provide a detailed evaluation with specific gaps if any are found.`,
		goal.Goal,
		goal.Constraints,
		goal.Context)

	var result EvaluationResult
	err = llm.GenerateTypedJSONWithConversation(job.GetContext(), a.client,
		append(
			[]openai.ChatCompletionMessage{
				{
					Role:    "system",
					Content: prompt,
				},
			},
			conv...),
		a.options.LLMAPI.Model, schema, &result)
	if err != nil {
		return nil, fmt.Errorf("error generating evaluation: %w", err)
	}

	return &result, nil
}
// handleEvaluation decides whether the conversation can be treated as a
// satisfying answer.
//
// It returns (satisfied, conversation, error):
//   - evaluation disabled, or currentLoop at/over the configured maximum:
//     (true, conv, nil) without evaluating;
//   - evaluateJob fails: (false, conv, err);
//   - the evaluation is satisfied, or reports no gaps: (true, conv, nil);
//   - otherwise: (false, conv plus a system message listing the gaps and the
//     reasoning, nil).
//
// The loop counter itself is not modified here; only the would-be next value
// is logged.
func (a *Agent) handleEvaluation(job *types.Job, conv []openai.ChatCompletionMessage, currentLoop int) (bool, []openai.ChatCompletionMessage, error) {
	evaluationActive := a.options.enableEvaluation && currentLoop < a.options.maxEvaluationLoops
	if !evaluationActive {
		return true, conv, nil
	}

	verdict, err := a.evaluateJob(job, conv)
	switch {
	case err != nil:
		return false, conv, err
	case verdict.Satisfied, len(verdict.Gaps) == 0:
		// Nothing actionable to feed back into the conversation.
		return true, conv, nil
	}

	// Surface the gaps to the model as a system message so a further pass
	// over the conversation can address them.
	feedback := openai.ChatCompletionMessage{
		Role: "system",
		Content: fmt.Sprintf("Evaluation found gaps that need to be addressed:\n%s\nReasoning: %s",
			verdict.Gaps, verdict.Reasoning),
	}
	xlog.Debug("Evaluation found gaps, incrementing loop count", "loop", currentLoop+1)

	return false, append(conv, feedback), nil
}

View File

@@ -12,7 +12,7 @@ func (a *Agent) generateIdentity(guidance string) error {
guidance = "Generate a random character for roleplaying." guidance = "Generate a random character for roleplaying."
} }
err := llm.GenerateTypedJSON(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character) err := llm.GenerateTypedJSONWithGuidance(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character)
//err := llm.GenerateJSONFromStruct(a.context.Context, a.client, guidance, a.options.LLMAPI.Model, &a.options.character) //err := llm.GenerateJSONFromStruct(a.context.Context, a.client, guidance, a.options.LLMAPI.Model, &a.options.character)
a.Character = a.options.character a.Character = a.options.character
if err != nil { if err != nil {

View File

@@ -42,6 +42,10 @@ type options struct {
kbResults int kbResults int
ragdb RAGDB ragdb RAGDB
// Evaluation settings
maxEvaluationLoops int
enableEvaluation bool
prompts []DynamicPrompt prompts []DynamicPrompt
systemPrompt string systemPrompt string
@@ -68,9 +72,11 @@ func (o *options) SeparatedMultimodalModel() bool {
func defaultOptions() *options { func defaultOptions() *options {
return &options{ return &options{
parallelJobs: 1, parallelJobs: 1,
periodicRuns: 15 * time.Minute, periodicRuns: 15 * time.Minute,
loopDetectionSteps: 10, loopDetectionSteps: 10,
maxEvaluationLoops: 2,
enableEvaluation: false,
LLMAPI: llmOptions{ LLMAPI: llmOptions{
APIURL: "http://localhost:8080", APIURL: "http://localhost:8080",
Model: "gpt-4", Model: "gpt-4",
@@ -392,3 +398,17 @@ var EnableStripThinkingTags = func(o *options) error {
o.stripThinkingTags = true o.stripThinkingTags = true
return nil return nil
} }
// WithMaxEvaluationLoops returns an Option that stores loops as the
// maxEvaluationLoops setting. The value is stored as given, without validation.
func WithMaxEvaluationLoops(loops int) Option {
	var setMaxLoops Option = func(cfg *options) error {
		cfg.maxEvaluationLoops = loops
		return nil
	}
	return setMaxLoops
}
// EnableEvaluation returns an Option that switches the enableEvaluation
// setting on.
func EnableEvaluation() Option {
	var turnOn Option = func(cfg *options) error {
		cfg.enableEvaluation = true
		return nil
	}
	return turnOn
}

View File

@@ -74,6 +74,8 @@ type AgentConfig struct {
SummaryLongTermMemory bool `json:"summary_long_term_memory" form:"summary_long_term_memory"` SummaryLongTermMemory bool `json:"summary_long_term_memory" form:"summary_long_term_memory"`
ParallelJobs int `json:"parallel_jobs" form:"parallel_jobs"` ParallelJobs int `json:"parallel_jobs" form:"parallel_jobs"`
StripThinkingTags bool `json:"strip_thinking_tags" form:"strip_thinking_tags"` StripThinkingTags bool `json:"strip_thinking_tags" form:"strip_thinking_tags"`
EnableEvaluation bool `json:"enable_evaluation" form:"enable_evaluation"`
MaxEvaluationLoops int `json:"max_evaluation_loops" form:"max_evaluation_loops"`
} }
type AgentConfigMeta struct { type AgentConfigMeta struct {
@@ -309,6 +311,24 @@ func NewAgentConfigMeta(
HelpText: "Remove content between <thinking></thinking> and <think></think> tags from agent responses", HelpText: "Remove content between <thinking></thinking> and <think></think> tags from agent responses",
Tags: config.Tags{Section: "ModelSettings"}, Tags: config.Tags{Section: "ModelSettings"},
}, },
{
Name: "enable_evaluation",
Label: "Enable Evaluation",
Type: "checkbox",
DefaultValue: false,
HelpText: "Enable automatic evaluation of agent responses to ensure they meet user requirements",
Tags: config.Tags{Section: "AdvancedSettings"},
},
{
Name: "max_evaluation_loops",
Label: "Max Evaluation Loops",
Type: "number",
DefaultValue: 2,
Min: 1,
Step: 1,
HelpText: "Maximum number of evaluation loops to perform when addressing gaps in responses",
Tags: config.Tags{Section: "AdvancedSettings"},
},
}, },
MCPServers: []config.Field{ MCPServers: []config.Field{
{ {

View File

@@ -247,7 +247,7 @@ func createAgentAvatar(APIURL, APIKey, model, imageModel, avatarDir string, agen
ImagePrompt string `json:"image_prompt"` ImagePrompt string `json:"image_prompt"`
} }
err := llm.GenerateTypedJSON( err := llm.GenerateTypedJSONWithGuidance(
context.Background(), context.Background(),
llm.NewClient(APIKey, APIURL, "10m"), llm.NewClient(APIKey, APIURL, "10m"),
"Generate a prompt that I can use to create a random avatar for the bot '"+agent.Name+"', the description of the bot is: "+agent.Description, "Generate a prompt that I can use to create a random avatar for the bot '"+agent.Name+"', the description of the bot is: "+agent.Description,
@@ -561,6 +561,13 @@ func (a *AgentPool) startAgentWithConfig(name string, config *AgentConfig, obs O
opts = append(opts, WithParallelJobs(config.ParallelJobs)) opts = append(opts, WithParallelJobs(config.ParallelJobs))
} }
if config.EnableEvaluation {
opts = append(opts, EnableEvaluation())
if config.MaxEvaluationLoops > 0 {
opts = append(opts, WithMaxEvaluationLoops(config.MaxEvaluationLoops))
}
}
xlog.Info("Starting agent", "name", name, "config", config) xlog.Info("Starting agent", "name", name, "config", config)
agent, err := New(opts...) agent, err := New(opts...)

View File

@@ -162,23 +162,23 @@ func newUUID() string {
// To wait for a Job result, use JobResult.WaitResult() // To wait for a Job result, use JobResult.WaitResult()
func NewJob(opts ...JobOption) *Job { func NewJob(opts ...JobOption) *Job {
j := &Job{ j := &Job{
Result: NewJobResult(), Result: NewJobResult(),
UUID: newUUID(), UUID: uuid.New().String(),
} Metadata: make(map[string]interface{}),
for _, o := range opts { context: context.Background(),
o(j) ConversationHistory: []openai.ChatCompletionMessage{},
} }
var ctx context.Context for _, opt := range opts {
if j.context == nil { opt(j)
ctx = context.Background()
} else {
ctx = j.context
} }
context, cancel := context.WithCancel(ctx) // Store the original request if it exists in the conversation history
j.context = context
ctx, cancel := context.WithCancel(j.context)
j.context = ctx
j.cancel = cancel j.cancel = cancel
return j return j
} }
@@ -207,3 +207,23 @@ func WithObservable(obs *Observable) JobOption {
j.Obs = obs j.Obs = obs
} }
} }
// GetEvaluationLoop returns the evaluation loop count stored in the job's
// Metadata under the "evaluation_loop" key.
//
// It returns 0 when the key is absent or holds a value that is not an int.
// As a side effect, a nil Metadata map is replaced with an empty one.
func (j *Job) GetEvaluationLoop() int {
	if j.Metadata == nil {
		j.Metadata = make(map[string]interface{})
	}

	count, isInt := j.Metadata["evaluation_loop"].(int)
	if !isInt {
		return 0
	}
	return count
}
// IncrementEvaluationLoop stores the current evaluation loop count plus one
// in the job's Metadata under the "evaluation_loop" key, creating the
// Metadata map first when it is nil.
func (j *Job) IncrementEvaluationLoop() {
	next := j.GetEvaluationLoop() + 1

	// Writing to a nil map panics, so make sure the map exists before storing.
	if j.Metadata == nil {
		j.Metadata = make(map[string]interface{})
	}
	j.Metadata["evaluation_loop"] = next
}

View File

@@ -10,16 +10,20 @@ import (
"github.com/sashabaranov/go-openai/jsonschema" "github.com/sashabaranov/go-openai/jsonschema"
) )
func GenerateTypedJSON(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error { func GenerateTypedJSONWithGuidance(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error {
return GenerateTypedJSONWithConversation(ctx, client, []openai.ChatCompletionMessage{
{
Role: "user",
Content: guidance,
},
}, model, i, dst)
}
func GenerateTypedJSONWithConversation(ctx context.Context, client *openai.Client, conv []openai.ChatCompletionMessage, model string, i jsonschema.Definition, dst any) error {
toolName := "json" toolName := "json"
decision := openai.ChatCompletionRequest{ decision := openai.ChatCompletionRequest{
Model: model, Model: model,
Messages: []openai.ChatCompletionMessage{ Messages: conv,
{
Role: "user",
Content: guidance,
},
},
Tools: []openai.Tool{ Tools: []openai.Tool{
{ {

View File

@@ -78,7 +78,7 @@ func (f *ClassifierFilter) Apply(job *types.Job) (bool, error) {
var result struct { var result struct {
Asserted bool `json:"answer"` Asserted bool `json:"answer"`
} }
err := llm.GenerateTypedJSON(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{ err := llm.GenerateTypedJSONWithGuidance(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{
Type: jsonschema.Object, Type: jsonschema.Object,
Properties: map[string]jsonschema.Definition{ Properties: map[string]jsonschema.Definition{
"answer": { "answer": {

View File

@@ -576,7 +576,7 @@ func (a *App) GenerateGroupProfiles(pool *state.AgentPool) func(c *fiber.Ctx) er
xlog.Debug("Generating group", "description", request.Descript) xlog.Debug("Generating group", "description", request.Descript)
client := llm.NewClient(a.config.LLMAPIKey, a.config.LLMAPIURL, "10m") client := llm.NewClient(a.config.LLMAPIKey, a.config.LLMAPIURL, "10m")
err := llm.GenerateTypedJSON(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{ err := llm.GenerateTypedJSONWithGuidance(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{
Type: jsonschema.Object, Type: jsonschema.Object,
Properties: map[string]jsonschema.Definition{ Properties: map[string]jsonschema.Definition{
"agents": { "agents": {