feat(evaluation): add deep evaluation mechanism (#145)

* feat(evaluation): add deep evaluation mechanism Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * consider whole conversation when evaluating Signed-off-by: mudler <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Signed-off-by: mudler <mudler@localai.io>
2025-05-11 18:31:04 +02:00
parent 289edb67a6
commit e431bc234b
10 changed files with 293 additions and 36 deletions
--- a/core/agent/agent.go
+++ b/core/agent/agent.go
@@ -560,7 +560,6 @@ func (a *Agent) filterJob(job *types.Job) (ok bool, err error) {
 }
 func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 	if err := job.GetContext().Err(); err != nil {
 		job.Result.Finish(fmt.Errorf("expired"))
 		return
@@ -659,12 +658,9 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 		}
 	}
 	//xlog.Debug("Picked action", "agent", a.Character.Name, "action", chosenAction.Definition().Name, "reasoning", reasoning)
 	if chosenAction == nil {
 		// If no action was picked up, the reasoning is the message returned by the assistant
 		// so we can consume it as if it was a reply.
 		//job.Result.SetResult(ActionState{ActionCurrentState{nil, nil, "No action to do, just reply"}, ""})
 		//job.Result.Finish(fmt.Errorf("no action to do"))\
 		xlog.Info("No action to do, just reply", "agent", a.Character.Name, "reasoning", reasoning)
 		if reasoning != "" {
@@ -684,6 +680,23 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 			reasoning = msg.Content
 		}
 		var satisfied bool
 		var err error
 		// Evaluate the response
 		satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop())
 		if err != nil {
 			job.Result.Finish(fmt.Errorf("error evaluating response: %w", err))
 			return
 		}
 		if !satisfied {
 			// If not satisfied, continue with the conversation
 			job.ConversationHistory = conv
 			job.IncrementEvaluationLoop()
 			a.consumeJob(job, role, retries)
 			return
 		}
 		xlog.Debug("Finish job with reasoning", "reasoning", reasoning, "agent", a.Character.Name, "conversation", fmt.Sprintf("%+v", conv))
 		job.Result.Conversation = conv
 		job.Result.AddFinalizer(func(conv []openai.ChatCompletionMessage) {
@@ -773,8 +786,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 	conv, err = a.handlePlanning(job.GetContext(), job, chosenAction, actionParams, reasoning, pickTemplate, conv)
 	if err != nil {
 		xlog.Error("error handling planning", "error", err)
 		//job.Result.Conversation = conv
 		//job.Result.SetResponse(msg.Content)
 		a.reply(job, role, append(conv, openai.ChatCompletionMessage{
 			Role:    "assistant",
 			Content: fmt.Sprintf("Error handling planning: %v", err),
@@ -821,9 +832,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 	if !chosenAction.Definition().Name.Is(action.PlanActionName) {
 		result, err := a.runAction(job, chosenAction, actionParams)
 		if err != nil {
 			//job.Result.Finish(fmt.Errorf("error running action: %w", err))
 			//return
 			// make the LLM aware of the error of running the action instead of stopping the job here
 			result.Result = fmt.Sprintf("Error running tool: %v", err)
 		}
@@ -866,6 +874,22 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
 		return
 	}
 	// Evaluate the final response
 	var satisfied bool
 	satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop())
 	if err != nil {
 		job.Result.Finish(fmt.Errorf("error evaluating response: %w", err))
 		return
 	}
 	if !satisfied {
 		// If not satisfied, continue with the conversation
 		job.ConversationHistory = conv
 		job.IncrementEvaluationLoop()
 		a.consumeJob(job, role, retries)
 		return
 	}
 	a.reply(job, role, conv, actionParams, chosenAction, reasoning)
 }
--- a/core/agent/evaluation.go
+++ b/core/agent/evaluation.go
@@ -0,0 +1,162 @@
 package agent
 import (
 	"fmt"
 	"github.com/mudler/LocalAGI/core/types"
 	"github.com/mudler/LocalAGI/pkg/llm"
 	"github.com/mudler/LocalAGI/pkg/xlog"
 	"github.com/sashabaranov/go-openai"
 	"github.com/sashabaranov/go-openai/jsonschema"
 )
 // EvaluationResult is the verdict decoded from the LLM's JSON reply when a
 // conversation is evaluated against the user's request.
 type EvaluationResult struct {
 	// Satisfied reports whether the evaluator judged the request fulfilled.
 	Satisfied bool     `json:"satisfied"`
 	// Gaps lists the specific shortcomings the evaluator found, if any.
 	Gaps      []string `json:"gaps"`
 	// Reasoning is the evaluator's free-text justification for the verdict.
 	Reasoning string   `json:"reasoning"`
 }
 // GoalExtraction is the structured summary of what the user is asking for, as
 // decoded from the LLM's JSON reply during goal extraction.
 type GoalExtraction struct {
 	// Goal is the main goal or request from the user.
 	Goal        string   `json:"goal"`
 	// Constraints holds any constraints or requirements specified by the user.
 	Constraints []string `json:"constraints"`
 	// Context is additional context that may be relevant for understanding the goal.
 	Context     string   `json:"context"`
 }
 // extractGoal asks the LLM to summarise what the user is after.
 //
 // The conversation is sent behind a system prompt and the reply is decoded,
 // against a JSON schema, into a GoalExtraction (goal, constraints, context).
 // If the call fails, a nil result is returned together with a wrapped error.
 func (a *Agent) extractGoal(job *types.Job, conv []openai.ChatCompletionMessage) (*GoalExtraction, error) {
 	// Instruction placed ahead of the conversation as a system message.
 	const instruction = `Analyze the conversation and extract the user's main goal, any constraints, and relevant context.
 Consider the entire conversation history to understand the complete context and requirements.
 Focus on identifying the primary objective and any specific requirements or limitations mentioned.`
 	// Properties the reply must carry; every one of them is mandatory.
 	properties := map[string]jsonschema.Definition{
 		"goal": {
 			Type:        jsonschema.String,
 			Description: "The main goal or request from the user",
 		},
 		"constraints": {
 			Type:        jsonschema.Array,
 			Items:       &jsonschema.Definition{Type: jsonschema.String},
 			Description: "Any constraints or requirements specified by the user",
 		},
 		"context": {
 			Type:        jsonschema.String,
 			Description: "Additional context that might be relevant for understanding the goal",
 		},
 	}
 	goalSchema := jsonschema.Definition{
 		Type:       jsonschema.Object,
 		Properties: properties,
 		Required:   []string{"goal", "constraints", "context"},
 	}
 	ctx := job.GetContext()
 	// System instruction first, then the whole conversation in its original order.
 	messages := make([]openai.ChatCompletionMessage, 0, len(conv)+1)
 	messages = append(messages, openai.ChatCompletionMessage{
 		Role:    "system",
 		Content: instruction,
 	})
 	messages = append(messages, conv...)
 	extracted := new(GoalExtraction)
 	if err := llm.GenerateTypedJSONWithConversation(ctx, a.client, messages, a.options.LLMAPI.Model, goalSchema, extracted); err != nil {
 		return nil, fmt.Errorf("error extracting goal: %w", err)
 	}
 	return extracted, nil
 }
 // evaluateJob asks the LLM whether the conversation so far satisfies the
 // user's request.
 //
 // When evaluation is disabled in the agent options it returns a satisfied
 // result without calling the LLM. Otherwise it first extracts the user's goal
 // with extractGoal, then sends a system prompt built from that goal followed by
 // the full conversation, and decodes the reply into an EvaluationResult.
 // On any failure the returned result is nil and the error is non-nil.
 func (a *Agent) evaluateJob(job *types.Job, conv []openai.ChatCompletionMessage) (*EvaluationResult, error) {
 	if !a.options.enableEvaluation {
 		return &EvaluationResult{Satisfied: true}, nil
 	}
 	// Extract the goal first
 	goal, err := a.extractGoal(job, conv)
 	if err != nil {
 		// extractGoal already prefixes its errors with "error extracting goal";
 		// wrapping again here would repeat that prefix in the final message.
 		return nil, err
 	}
 	// Create the evaluation schema
 	schema := jsonschema.Definition{
 		Type: jsonschema.Object,
 		Properties: map[string]jsonschema.Definition{
 			"satisfied": {
 				Type: jsonschema.Boolean,
 			},
 			"gaps": {
 				Type: jsonschema.Array,
 				Items: &jsonschema.Definition{
 					Type: jsonschema.String,
 				},
 			},
 			"reasoning": {
 				Type: jsonschema.String,
 			},
 		},
 		Required: []string{"satisfied", "gaps", "reasoning"},
 	}
 	// Create the evaluation prompt
 	prompt := fmt.Sprintf(`Evaluate if the assistant has satisfied the user's request. Consider:
 1. The identified goal: %s
 2. Constraints and requirements: %v
 3. Context: %s
 4. The conversation history
 5. Any gaps or missing information
 6. Whether the response fully addresses the user's needs
 Provide a detailed evaluation with specific gaps if any are found.`,
 		goal.Goal,
 		goal.Constraints,
 		goal.Context)
 	var result EvaluationResult
 	// The evaluation prompt goes first as a system message, followed by the
 	// whole conversation so the evaluator sees every turn.
 	err = llm.GenerateTypedJSONWithConversation(job.GetContext(), a.client,
 		append(
 			[]openai.ChatCompletionMessage{
 				{
 					Role:    "system",
 					Content: prompt,
 				},
 			},
 			conv...),
 		a.options.LLMAPI.Model, schema, &result)
 	if err != nil {
 		return nil, fmt.Errorf("error generating evaluation: %w", err)
 	}
 	return &result, nil
 }
 // handleEvaluation runs one evaluation pass over conv and reports whether the
 // job may finish.
 //
 // It returns (true, conv, nil) without calling the LLM when evaluation is
 // disabled or currentLoop has reached the configured maxEvaluationLoops.
 // Otherwise it calls evaluateJob. If the evaluator is not satisfied and names
 // at least one gap, a system message listing the gaps and the evaluator's
 // reasoning is appended to the conversation and (false, updatedConv, nil) is
 // returned so the caller can run another round. The loop counter is not
 // modified here; the caller is responsible for advancing it.
 // On evaluation failure it returns (false, conv, err).
 func (a *Agent) handleEvaluation(job *types.Job, conv []openai.ChatCompletionMessage, currentLoop int) (bool, []openai.ChatCompletionMessage, error) {
 	if !a.options.enableEvaluation || currentLoop >= a.options.maxEvaluationLoops {
 		return true, conv, nil
 	}
 	result, err := a.evaluateJob(job, conv)
 	if err != nil {
 		return false, conv, err
 	}
 	// An unsatisfied verdict that names no gaps gives the agent nothing
 	// concrete to act on, so it is treated the same as a satisfied one.
 	if result.Satisfied || len(result.Gaps) == 0 {
 		return true, conv, nil
 	}
 	// Render one gap per line. Formatting the slice directly with %s would
 	// print "[a b c]", which blurs the boundaries between multi-word gaps.
 	var gaps []byte
 	for _, gap := range result.Gaps {
 		gaps = append(gaps, "- "...)
 		gaps = append(gaps, gap...)
 		gaps = append(gaps, '\n')
 	}
 	// Add the evaluation result to the conversation
 	conv = append(conv, openai.ChatCompletionMessage{
 		Role: "system",
 		Content: fmt.Sprintf("Evaluation found gaps that need to be addressed:\n%sReasoning: %s",
 			gaps, result.Reasoning),
 	})
 	xlog.Debug("Evaluation found gaps, incrementing loop count", "loop", currentLoop+1)
 	return false, conv, nil
 }
--- a/core/agent/identity.go
+++ b/core/agent/identity.go
@@ -12,7 +12,7 @@ func (a *Agent) generateIdentity(guidance string) error {
 		guidance = "Generate a random character for roleplaying."
 	}
-	err := llm.GenerateTypedJSON(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character)
+	err := llm.GenerateTypedJSONWithGuidance(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character)
 	//err := llm.GenerateJSONFromStruct(a.context.Context, a.client, guidance, a.options.LLMAPI.Model, &a.options.character)
 	a.Character = a.options.character
 	if err != nil {
--- a/core/agent/options.go
+++ b/core/agent/options.go
@@ -42,6 +42,10 @@ type options struct {
 	kbResults             int
 	ragdb                 RAGDB
 	// Evaluation settings
 	maxEvaluationLoops int
 	enableEvaluation   bool
 	prompts []DynamicPrompt
 	systemPrompt string
@@ -68,9 +72,11 @@ func (o *options) SeparatedMultimodalModel() bool {
 func defaultOptions() *options {
 	return &options{
-		parallelJobs: 1,
+		parallelJobs:       1,
-		periodicRuns: 15 * time.Minute,
+		periodicRuns:       15 * time.Minute,
 		loopDetectionSteps: 10,
 		maxEvaluationLoops: 2,
 		enableEvaluation:   false,
 		LLMAPI: llmOptions{
 			APIURL: "http://localhost:8080",
 			Model:  "gpt-4",
@@ -392,3 +398,17 @@ var EnableStripThinkingTags = func(o *options) error {
 	o.stripThinkingTags = true
 	return nil
 }
 // WithMaxEvaluationLoops returns an Option that sets the maximum number of
 // evaluation loops an agent may run for a job. The value is stored as given;
 // no range validation is performed here.
 func WithMaxEvaluationLoops(loops int) Option {
 	setLimit := func(cfg *options) error {
 		cfg.maxEvaluationLoops = loops
 		return nil
 	}
 	return setLimit
 }
 // EnableEvaluation returns an Option that switches on evaluation of the
 // agent's responses. It only sets the flag; the loop limit is left untouched.
 func EnableEvaluation() Option {
 	turnOn := func(cfg *options) error {
 		cfg.enableEvaluation = true
 		return nil
 	}
 	return turnOn
 }
--- a/core/state/config.go
+++ b/core/state/config.go
@@ -74,6 +74,8 @@ type AgentConfig struct {
 	SummaryLongTermMemory bool   `json:"summary_long_term_memory" form:"summary_long_term_memory"`
 	ParallelJobs          int    `json:"parallel_jobs" form:"parallel_jobs"`
 	StripThinkingTags     bool   `json:"strip_thinking_tags" form:"strip_thinking_tags"`
 	EnableEvaluation      bool   `json:"enable_evaluation" form:"enable_evaluation"`
 	MaxEvaluationLoops    int    `json:"max_evaluation_loops" form:"max_evaluation_loops"`
 }
 type AgentConfigMeta struct {
@@ -309,6 +311,24 @@ func NewAgentConfigMeta(
 				HelpText:     "Remove content between <thinking></thinking> and <think></think> tags from agent responses",
 				Tags:         config.Tags{Section: "ModelSettings"},
 			},
 			{
 				Name:         "enable_evaluation",
 				Label:        "Enable Evaluation",
 				Type:         "checkbox",
 				DefaultValue: false,
 				HelpText:     "Enable automatic evaluation of agent responses to ensure they meet user requirements",
 				Tags:         config.Tags{Section: "AdvancedSettings"},
 			},
 			{
 				Name:         "max_evaluation_loops",
 				Label:        "Max Evaluation Loops",
 				Type:         "number",
 				DefaultValue: 2,
 				Min:          1,
 				Step:         1,
 				HelpText:     "Maximum number of evaluation loops to perform when addressing gaps in responses",
 				Tags:         config.Tags{Section: "AdvancedSettings"},
 			},
 		},
 		MCPServers: []config.Field{
 			{
--- a/core/state/pool.go
+++ b/core/state/pool.go
@@ -247,7 +247,7 @@ func createAgentAvatar(APIURL, APIKey, model, imageModel, avatarDir string, agen
 		ImagePrompt string `json:"image_prompt"`
 	}
-	err := llm.GenerateTypedJSON(
+	err := llm.GenerateTypedJSONWithGuidance(
 		context.Background(),
 		llm.NewClient(APIKey, APIURL, "10m"),
 		"Generate a prompt that I can use to create a random avatar for the bot '"+agent.Name+"', the description of the bot is: "+agent.Description,
@@ -561,6 +561,13 @@ func (a *AgentPool) startAgentWithConfig(name string, config *AgentConfig, obs O
 		opts = append(opts, WithParallelJobs(config.ParallelJobs))
 	}
 	if config.EnableEvaluation {
 		opts = append(opts, EnableEvaluation())
 		if config.MaxEvaluationLoops > 0 {
 			opts = append(opts, WithMaxEvaluationLoops(config.MaxEvaluationLoops))
 		}
 	}
 	xlog.Info("Starting agent", "name", name, "config", config)
 	agent, err := New(opts...)
--- a/core/types/job.go
+++ b/core/types/job.go
@@ -162,23 +162,23 @@ func newUUID() string {
 // To wait for a Job result, use JobResult.WaitResult()
 func NewJob(opts ...JobOption) *Job {
 	j := &Job{
-		Result: NewJobResult(),
+		Result:              NewJobResult(),
-		UUID:   newUUID(),
+		UUID:                uuid.New().String(),
-	}
+		Metadata:            make(map[string]interface{}),
-	for _, o := range opts {
+		context:             context.Background(),
-		o(j)
+		ConversationHistory: []openai.ChatCompletionMessage{},
 	}
-	var ctx context.Context
+	for _, opt := range opts {
-	if j.context == nil {
+		opt(j)
 		ctx = context.Background()
 	} else {
 		ctx = j.context
 	}
-	context, cancel := context.WithCancel(ctx)
+	// Store the original request if it exists in the conversation history
-	j.context = context
+
 	ctx, cancel := context.WithCancel(j.context)
 	j.context = ctx
 	j.cancel = cancel
 	return j
 }
@@ -207,3 +207,23 @@ func WithObservable(obs *Observable) JobOption {
 		j.Obs = obs
 	}
 }
 // GetEvaluationLoop returns the current evaluation loop count
 // GetEvaluationLoop returns the evaluation loop count stored under the
 // "evaluation_loop" key of the job's Metadata.
 //
 // It returns 0 when Metadata is nil, when the key is absent, or when the
 // stored value is not an int. The job is never modified by this call.
 func (j *Job) GetEvaluationLoop() int {
 	// Reading from a nil map is safe and yields the zero value, so the getter
 	// has no need to allocate Metadata. A failed comma-ok assertion leaves
 	// loop at 0.
 	loop, _ := j.Metadata["evaluation_loop"].(int)
 	return loop
 }
 // IncrementEvaluationLoop increments the evaluation loop count
 // IncrementEvaluationLoop bumps the evaluation loop count kept under the
 // "evaluation_loop" key of the job's Metadata by one.
 func (j *Job) IncrementEvaluationLoop() {
 	// Writing to a nil map panics, so make sure Metadata exists first.
 	if j.Metadata == nil {
 		j.Metadata = map[string]interface{}{}
 	}
 	j.Metadata["evaluation_loop"] = j.GetEvaluationLoop() + 1
 }
--- a/pkg/llm/json.go
+++ b/pkg/llm/json.go
@@ -10,16 +10,20 @@ import (
 	"github.com/sashabaranov/go-openai/jsonschema"
 )
-func GenerateTypedJSON(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error {
+func GenerateTypedJSONWithGuidance(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error {
 	return GenerateTypedJSONWithConversation(ctx, client, []openai.ChatCompletionMessage{
 		{
 			Role:    "user",
 			Content: guidance,
 		},
 	}, model, i, dst)
 }
 func GenerateTypedJSONWithConversation(ctx context.Context, client *openai.Client, conv []openai.ChatCompletionMessage, model string, i jsonschema.Definition, dst any) error {
 	toolName := "json"
 	decision := openai.ChatCompletionRequest{
-		Model: model,
+		Model:    model,
-		Messages: []openai.ChatCompletionMessage{
+		Messages: conv,
 			{
 				Role:    "user",
 				Content: guidance,
 			},
 		},
 		Tools: []openai.Tool{
 			{
--- a/services/filters/classifier.go
+++ b/services/filters/classifier.go
@@ -78,7 +78,7 @@ func (f *ClassifierFilter) Apply(job *types.Job) (bool, error) {
 	var result struct {
 		Asserted bool `json:"answer"`
 	}
-	err := llm.GenerateTypedJSON(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{
+	err := llm.GenerateTypedJSONWithGuidance(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{
 		Type: jsonschema.Object,
 		Properties: map[string]jsonschema.Definition{
 			"answer": {
--- a/webui/app.go
+++ b/webui/app.go
@@ -576,7 +576,7 @@ func (a *App) GenerateGroupProfiles(pool *state.AgentPool) func(c *fiber.Ctx) er
 		xlog.Debug("Generating group", "description", request.Descript)
 		client := llm.NewClient(a.config.LLMAPIKey, a.config.LLMAPIURL, "10m")
-		err := llm.GenerateTypedJSON(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{
+		err := llm.GenerateTypedJSONWithGuidance(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{
 			Type: jsonschema.Object,
 			Properties: map[string]jsonschema.Definition{
 				"agents": {