From e431bc234b9d81eeab612e845150dca818f9257c Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@users.noreply.github.com>
Date: Sun, 11 May 2025 18:31:04 +0200
Subject: [PATCH] feat(evaluation): add deep evaluation mechanism (#145)

* feat(evaluation): add deep evaluation mechanism

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* consider whole conversation when evaluating

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
---
 core/agent/agent.go            |  44 +++++++--
 core/agent/evaluation.go       | 162 +++++++++++++++++++++++++++++++++
 core/agent/identity.go         |   2 +-
 core/agent/options.go          |  24 ++++-
 core/state/config.go           |  20 ++++
 core/state/pool.go             |   9 +-
 core/types/job.go              |  44 ++++++---
 pkg/llm/json.go                |  20 ++--
 services/filters/classifier.go |   2 +-
 webui/app.go                   |   2 +-
 10 files changed, 293 insertions(+), 36 deletions(-)
 create mode 100644 core/agent/evaluation.go

diff --git a/core/agent/agent.go b/core/agent/agent.go
index 658fec9..016d9bf 100644
--- a/core/agent/agent.go
+++ b/core/agent/agent.go
@@ -525,7 +525,7 @@ func (a *Agent) filterJob(job *types.Job) (ok bool, err error) {
 			if ok {
 				triggeredBy = name
 				xlog.Info("Job triggered by filter", "filter", name)
-			} 
+			}
 		} else if !ok {
 			failedBy = name
 			xlog.Info("Job failed filter", "filter", name)
@@ -560,7 +560,6 @@ func (a *Agent) filterJob(job *types.Job) (ok bool, err error) {
 }
 
 func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
-
 	if err := job.GetContext().Err(); err != nil {
 		job.Result.Finish(fmt.Errorf("expired"))
 		return
@@ -659,12 +658,9 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 		}
 	}
 
-	//xlog.Debug("Picked action", "agent", a.Character.Name, "action", chosenAction.Definition().Name, "reasoning", reasoning)
 	if chosenAction == nil {
 		// If no action was picked up, the reasoning is the message returned by the assistant
 		// so we can consume it as if it was a reply.
-		//job.Result.SetResult(ActionState{ActionCurrentState{nil, nil, "No action to do, just reply"}, ""})
-		//job.Result.Finish(fmt.Errorf("no action to do"))\
 		xlog.Info("No action to do, just reply", "agent", a.Character.Name, "reasoning", reasoning)
 
 		if reasoning != "" {
@@ -684,6 +680,23 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 			reasoning = msg.Content
 		}
 
+		var satisfied bool
+		var err error
+		// Evaluate the response
+		satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop())
+		if err != nil {
+			job.Result.Finish(fmt.Errorf("error evaluating response: %w", err))
+			return
+		}
+
+		if !satisfied {
+			// If not satisfied, continue with the conversation
+			job.ConversationHistory = conv
+			job.IncrementEvaluationLoop()
+			a.consumeJob(job, role, retries)
+			return
+		}
+
 		xlog.Debug("Finish job with reasoning", "reasoning", reasoning, "agent", a.Character.Name, "conversation", fmt.Sprintf("%+v", conv))
 		job.Result.Conversation = conv
 		job.Result.AddFinalizer(func(conv []openai.ChatCompletionMessage) {
@@ -773,8 +786,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 	conv, err = a.handlePlanning(job.GetContext(), job, chosenAction, actionParams, reasoning, pickTemplate, conv)
 	if err != nil {
 		xlog.Error("error handling planning", "error", err)
-		//job.Result.Conversation = conv
-		//job.Result.SetResponse(msg.Content)
 		a.reply(job, role, append(conv, openai.ChatCompletionMessage{
 			Role:    "assistant",
 			Content: fmt.Sprintf("Error handling planning: %v", err),
@@ -821,9 +832,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 	if !chosenAction.Definition().Name.Is(action.PlanActionName) {
 		result, err := a.runAction(job, chosenAction, actionParams)
 		if err != nil {
-			//job.Result.Finish(fmt.Errorf("error running action: %w", err))
-			//return
-			// make the LLM aware of the error of running the action instead of stopping the job here
 			result.Result = fmt.Sprintf("Error running tool: %v", err)
 		}
 
@@ -866,6 +874,22 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 		return
 	}
 
+	// Evaluate the final response
+	var satisfied bool
+	satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop())
+	if err != nil {
+		job.Result.Finish(fmt.Errorf("error evaluating response: %w", err))
+		return
+	}
+
+	if !satisfied {
+		// If not satisfied, continue with the conversation
+		job.ConversationHistory = conv
+		job.IncrementEvaluationLoop()
+		a.consumeJob(job, role, retries)
+		return
+	}
+
 	a.reply(job, role, conv, actionParams, chosenAction, reasoning)
 }
 
diff --git a/core/agent/evaluation.go b/core/agent/evaluation.go
new file mode 100644
index 0000000..7373f91
--- /dev/null
+++ b/core/agent/evaluation.go
@@ -0,0 +1,162 @@
+package agent
+
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAGI/core/types"
+	"github.com/mudler/LocalAGI/pkg/llm"
+	"github.com/mudler/LocalAGI/pkg/xlog"
+	"github.com/sashabaranov/go-openai"
+	"github.com/sashabaranov/go-openai/jsonschema"
+)
+
+type EvaluationResult struct { // verdict decoded from the evaluation LLM call in evaluateJob
+	Satisfied bool     `json:"satisfied"` // when true, handleEvaluation reports the job as satisfied
+	Gaps      []string `json:"gaps"`      // reported shortcomings; handleEvaluation appends them to the conversation as a system message
+	Reasoning string   `json:"reasoning"` // explanation included in that same system message
+}
+
+type GoalExtraction struct { // result decoded from the goal-extraction LLM call in extractGoal
+	Goal        string   `json:"goal"`        // the main goal or request from the user
+	Constraints []string `json:"constraints"` // constraints or requirements specified by the user
+	Context     string   `json:"context"`     // additional context relevant for understanding the goal
+}
+
+// extractGoal asks the LLM to distill conv into the user's goal, constraints
+// and context. A system prompt with the instructions is placed in front of
+// conv, and the reply is decoded through a JSON schema into a GoalExtraction.
+// On failure it returns a nil result and the wrapped error.
+func (a *Agent) extractGoal(job *types.Job, conv []openai.ChatCompletionMessage) (*GoalExtraction, error) {
+	const instructions = `Analyze the conversation and extract the user's main goal, any constraints, and relevant context.
+Consider the entire conversation history to understand the complete context and requirements.
+Focus on identifying the primary objective and any specific requirements or limitations mentioned.`
+
+	// Shape of the JSON object the model has to return.
+	goalSchema := jsonschema.Definition{
+		Type:     jsonschema.Object,
+		Required: []string{"goal", "constraints", "context"},
+		Properties: map[string]jsonschema.Definition{
+			"goal": {
+				Type:        jsonschema.String,
+				Description: "The main goal or request from the user",
+			},
+			"constraints": {
+				Type:        jsonschema.Array,
+				Items:       &jsonschema.Definition{Type: jsonschema.String},
+				Description: "Any constraints or requirements specified by the user",
+			},
+			"context": {
+				Type:        jsonschema.String,
+				Description: "Additional context that might be relevant for understanding the goal",
+			},
+		},
+	}
+
+	// The instructions go first as a system message, followed by conv.
+	messages := make([]openai.ChatCompletionMessage, 0, len(conv)+1)
+	messages = append(messages, openai.ChatCompletionMessage{Role: "system", Content: instructions})
+	messages = append(messages, conv...)
+
+	extracted := new(GoalExtraction)
+	err := llm.GenerateTypedJSONWithConversation(
+		job.GetContext(), a.client, messages, a.options.LLMAPI.Model, goalSchema, extracted,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("error extracting goal: %w", err)
+	}
+
+	return extracted, nil
+}
+
+// evaluateJob asks the LLM whether the conversation in conv satisfies the
+// user's request. It first calls extractGoal, then sends a system prompt
+// built from the extracted goal, constraints and context, followed by conv,
+// and decodes the reply into an EvaluationResult.
+// When evaluation is disabled it returns a satisfied result without calling
+// the LLM. On failure it returns a nil result and the error.
+func (a *Agent) evaluateJob(job *types.Job, conv []openai.ChatCompletionMessage) (*EvaluationResult, error) {
+	if !a.options.enableEvaluation {
+		return &EvaluationResult{Satisfied: true}, nil
+	}
+
+	// Extract the goal first
+	goal, err := a.extractGoal(job, conv)
+	if err != nil {
+		// extractGoal already wraps its errors with "error extracting goal";
+		// wrapping again here would only repeat that prefix.
+		return nil, err
+	}
+
+	// Create the evaluation schema
+	schema := jsonschema.Definition{
+		Type: jsonschema.Object,
+		Properties: map[string]jsonschema.Definition{
+			"satisfied": {
+				Type: jsonschema.Boolean,
+			},
+			"gaps": {
+				Type:  jsonschema.Array,
+				Items: &jsonschema.Definition{Type: jsonschema.String},
+			},
+			"reasoning": {
+				Type: jsonschema.String,
+			},
+		},
+		Required: []string{"satisfied", "gaps", "reasoning"},
+	}
+
+	// Create the evaluation prompt
+	prompt := fmt.Sprintf(`Evaluate if the assistant has satisfied the user's request. Consider:
+1. The identified goal: %s
+2. Constraints and requirements: %v
+3. Context: %s
+4. The conversation history
+5. Any gaps or missing information
+6. Whether the response fully addresses the user's needs
+
+Provide a detailed evaluation with specific gaps if any are found.`,
+		goal.Goal,
+		goal.Constraints,
+		goal.Context)
+
+	// The evaluation prompt goes first, followed by the conversation.
+	messages := append([]openai.ChatCompletionMessage{{Role: "system", Content: prompt}}, conv...)
+
+	var result EvaluationResult
+	err = llm.GenerateTypedJSONWithConversation(job.GetContext(), a.client, messages, a.options.LLMAPI.Model, schema, &result)
+	if err != nil {
+		return nil, fmt.Errorf("error generating evaluation: %w", err)
+	}
+
+	return &result, nil
+}
+
+// handleEvaluation runs one evaluation pass over conv, unless evaluation is
+// disabled or currentLoop has reached the configured maximum. It returns
+// false, with the reported gaps appended to conv as a system message, only
+// when the evaluator is unsatisfied and names at least one gap.
+func (a *Agent) handleEvaluation(job *types.Job, conv []openai.ChatCompletionMessage, currentLoop int) (bool, []openai.ChatCompletionMessage, error) {
+	evaluationActive := a.options.enableEvaluation && currentLoop < a.options.maxEvaluationLoops
+	if !evaluationActive {
+		return true, conv, nil
+	}
+
+	verdict, err := a.evaluateJob(job, conv)
+	switch {
+	case err != nil:
+		return false, conv, err
+	case verdict.Satisfied, len(verdict.Gaps) == 0:
+		// Either the goal is met or there is nothing concrete to feed back.
+		return true, conv, nil
+	}
+
+	// Hand the gaps back to the agent as a system message for the next round.
+	feedback := openai.ChatCompletionMessage{
+		Role: "system",
+		Content: fmt.Sprintf("Evaluation found gaps that need to be addressed:\n%s\nReasoning: %s",
+			verdict.Gaps, verdict.Reasoning),
+	}
+
+	xlog.Debug("Evaluation found gaps, incrementing loop count", "loop", currentLoop+1)
+	return false, append(conv, feedback), nil
+}
diff --git a/core/agent/identity.go b/core/agent/identity.go
index a474d67..60ff14f 100644
--- a/core/agent/identity.go
+++ b/core/agent/identity.go
@@ -12,7 +12,7 @@ func (a *Agent) generateIdentity(guidance string) error {
 		guidance = "Generate a random character for roleplaying."
 	}
 
-	err := llm.GenerateTypedJSON(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character)
+	err := llm.GenerateTypedJSONWithGuidance(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character)
 	//err := llm.GenerateJSONFromStruct(a.context.Context, a.client, guidance, a.options.LLMAPI.Model, &a.options.character)
 	a.Character = a.options.character
 	if err != nil {
diff --git a/core/agent/options.go b/core/agent/options.go
index 4943dad..c7f4514 100644
--- a/core/agent/options.go
+++ b/core/agent/options.go
@@ -42,6 +42,10 @@ type options struct {
 	kbResults             int
 	ragdb                 RAGDB
 
+	// Evaluation settings
+	maxEvaluationLoops int
+	enableEvaluation   bool
+
 	prompts []DynamicPrompt
 
 	systemPrompt string
@@ -68,9 +72,11 @@ func (o *options) SeparatedMultimodalModel() bool {
 
 func defaultOptions() *options {
 	return &options{
-		parallelJobs: 1,
-		periodicRuns: 15 * time.Minute,
+		parallelJobs:       1,
+		periodicRuns:       15 * time.Minute,
 		loopDetectionSteps: 10,
+		maxEvaluationLoops: 2,
+		enableEvaluation:   false,
 		LLMAPI: llmOptions{
 			APIURL: "http://localhost:8080",
 			Model:  "gpt-4",
@@ -392,3 +398,17 @@ var EnableStripThinkingTags = func(o *options) error {
 	o.stripThinkingTags = true
 	return nil
 }
+
+// WithMaxEvaluationLoops returns an Option that sets the maximum number of
+// evaluation loops. handleEvaluation stops evaluating once a job's loop
+// counter reaches this value.
+func WithMaxEvaluationLoops(loops int) Option {
+	return func(o *options) error { o.maxEvaluationLoops = loops; return nil }
+}
+
+// EnableEvaluation returns an Option that switches on evaluation of the
+// agent's responses. Evaluation is off by default (see defaultOptions), and
+// handleEvaluation and evaluateJob skip their work while it is off.
+func EnableEvaluation() Option {
+	return func(o *options) error { o.enableEvaluation = true; return nil }
+}
diff --git a/core/state/config.go b/core/state/config.go
index d807ef7..0898975 100644
--- a/core/state/config.go
+++ b/core/state/config.go
@@ -74,6 +74,8 @@ type AgentConfig struct {
 	SummaryLongTermMemory bool   `json:"summary_long_term_memory" form:"summary_long_term_memory"`
 	ParallelJobs          int    `json:"parallel_jobs" form:"parallel_jobs"`
 	StripThinkingTags     bool   `json:"strip_thinking_tags" form:"strip_thinking_tags"`
+	EnableEvaluation      bool   `json:"enable_evaluation" form:"enable_evaluation"`
+	MaxEvaluationLoops    int    `json:"max_evaluation_loops" form:"max_evaluation_loops"`
 }
 
 type AgentConfigMeta struct {
@@ -309,6 +311,24 @@ func NewAgentConfigMeta(
 				HelpText:     "Remove content between <thinking></thinking> and <think></think> tags from agent responses",
 				Tags:         config.Tags{Section: "ModelSettings"},
 			},
+			{
+				Name:         "enable_evaluation",
+				Label:        "Enable Evaluation",
+				Type:         "checkbox",
+				DefaultValue: false,
+				HelpText:     "Enable automatic evaluation of agent responses to ensure they meet user requirements",
+				Tags:         config.Tags{Section: "AdvancedSettings"},
+			},
+			{
+				Name:         "max_evaluation_loops",
+				Label:        "Max Evaluation Loops",
+				Type:         "number",
+				DefaultValue: 2,
+				Min:          1,
+				Step:         1,
+				HelpText:     "Maximum number of evaluation loops to perform when addressing gaps in responses",
+				Tags:         config.Tags{Section: "AdvancedSettings"},
+			},
 		},
 		MCPServers: []config.Field{
 			{
diff --git a/core/state/pool.go b/core/state/pool.go
index 5cbac14..1ca8415 100644
--- a/core/state/pool.go
+++ b/core/state/pool.go
@@ -247,7 +247,7 @@ func createAgentAvatar(APIURL, APIKey, model, imageModel, avatarDir string, agen
 		ImagePrompt string `json:"image_prompt"`
 	}
 
-	err := llm.GenerateTypedJSON(
+	err := llm.GenerateTypedJSONWithGuidance(
 		context.Background(),
 		llm.NewClient(APIKey, APIURL, "10m"),
 		"Generate a prompt that I can use to create a random avatar for the bot '"+agent.Name+"', the description of the bot is: "+agent.Description,
@@ -561,6 +561,13 @@ func (a *AgentPool) startAgentWithConfig(name string, config *AgentConfig, obs O
 		opts = append(opts, WithParallelJobs(config.ParallelJobs))
 	}
 
+	if config.EnableEvaluation {
+		opts = append(opts, EnableEvaluation())
+		if config.MaxEvaluationLoops > 0 {
+			opts = append(opts, WithMaxEvaluationLoops(config.MaxEvaluationLoops))
+		}
+	}
+
 	xlog.Info("Starting agent", "name", name, "config", config)
 
 	agent, err := New(opts...)
diff --git a/core/types/job.go b/core/types/job.go
index 7a48ee1..c701c05 100644
--- a/core/types/job.go
+++ b/core/types/job.go
@@ -162,23 +162,23 @@ func newUUID() string {
 // To wait for a Job result, use JobResult.WaitResult()
 func NewJob(opts ...JobOption) *Job {
 	j := &Job{
-		Result: NewJobResult(),
-		UUID:   newUUID(),
-	}
-	for _, o := range opts {
-		o(j)
+		Result:              NewJobResult(),
+		UUID:                uuid.New().String(),
+		Metadata:            make(map[string]interface{}),
+		context:             context.Background(),
+		ConversationHistory: []openai.ChatCompletionMessage{},
 	}
 
-	var ctx context.Context
-	if j.context == nil {
-		ctx = context.Background()
-	} else {
-		ctx = j.context
+	for _, opt := range opts {
+		opt(j)
 	}
 
-	context, cancel := context.WithCancel(ctx)
-	j.context = context
+	if j.context == nil { // an option may have reset it; WithCancel panics on a nil parent
+		j.context = context.Background()
+	}
+	ctx, cancel := context.WithCancel(j.context)
+	j.context = ctx
 	j.cancel = cancel
 	return j
 }
 
@@ -207,3 +207,23 @@ func WithObservable(obs *Observable) JobOption {
 		j.Obs = obs
 	}
 }
+
+// GetEvaluationLoop returns the number of evaluation loops recorded for the
+// job under the "evaluation_loop" metadata key, or 0 when none is recorded
+// or the stored value is not an int. It never writes to Metadata: indexing
+// a nil map is safe in Go, so no allocation is needed for a read.
+func (j *Job) GetEvaluationLoop() int {
+	if loop, ok := j.Metadata["evaluation_loop"].(int); ok {
+		return loop
+	}
+	return 0
+}
+
+// IncrementEvaluationLoop adds one to the job's evaluation loop counter,
+// creating the Metadata map first when it has not been allocated yet.
+func (j *Job) IncrementEvaluationLoop() {
+	if j.Metadata == nil {
+		j.Metadata = map[string]interface{}{}
+	}
+	j.Metadata["evaluation_loop"] = j.GetEvaluationLoop() + 1
+}
diff --git a/pkg/llm/json.go b/pkg/llm/json.go
index 5386fe1..c4f48d1 100644
--- a/pkg/llm/json.go
+++ b/pkg/llm/json.go
@@ -10,16 +10,20 @@ import (
 	"github.com/sashabaranov/go-openai/jsonschema"
 )
 
-func GenerateTypedJSON(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error {
+// GenerateTypedJSONWithGuidance wraps guidance in a single "user" message
+// and delegates to GenerateTypedJSONWithConversation with the same model,
+// schema and destination.
+func GenerateTypedJSONWithGuidance(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error {
+	prompt := openai.ChatCompletionMessage{Role: "user", Content: guidance}
+	conv := []openai.ChatCompletionMessage{prompt}
+	return GenerateTypedJSONWithConversation(ctx, client, conv, model, i, dst)
+}
+
+func GenerateTypedJSONWithConversation(ctx context.Context, client *openai.Client, conv []openai.ChatCompletionMessage, model string, i jsonschema.Definition, dst any) error {
 	toolName := "json"
 	decision := openai.ChatCompletionRequest{
-		Model: model,
-		Messages: []openai.ChatCompletionMessage{
-			{
-				Role:    "user",
-				Content: guidance,
-			},
-		},
+		Model:    model,
+		Messages: conv,
 		Tools: []openai.Tool{
 			{
 
diff --git a/services/filters/classifier.go b/services/filters/classifier.go
index d85aa4f..e517a7c 100644
--- a/services/filters/classifier.go
+++ b/services/filters/classifier.go
@@ -78,7 +78,7 @@ func (f *ClassifierFilter) Apply(job *types.Job) (bool, error) {
 	var result struct {
 		Asserted bool `json:"answer"`
 	}
-	err := llm.GenerateTypedJSON(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{
+	err := llm.GenerateTypedJSONWithGuidance(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{
 		Type: jsonschema.Object,
 		Properties: map[string]jsonschema.Definition{
 			"answer": {
diff --git a/webui/app.go b/webui/app.go
index acf5291..ff4b29e 100644
--- a/webui/app.go
+++ b/webui/app.go
@@ -576,7 +576,7 @@ func (a *App) GenerateGroupProfiles(pool *state.AgentPool) func(c *fiber.Ctx) er
 
 		xlog.Debug("Generating group", "description", request.Descript)
 		client := llm.NewClient(a.config.LLMAPIKey, a.config.LLMAPIURL, "10m")
-		err := llm.GenerateTypedJSON(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{
+		err := llm.GenerateTypedJSONWithGuidance(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{
 			Type: jsonschema.Object,
 			Properties: map[string]jsonschema.Definition{
 				"agents": {