feat(evaluation): add deep evaluation mechanism (#145)
* feat(evaluation): add deep evaluation mechanism

  Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* consider whole conversation when evaluating

  Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
289edb67a6
commit
e431bc234b
@@ -560,7 +560,6 @@ func (a *Agent) filterJob(job *types.Job) (ok bool, err error) {
|
||||
}
|
||||
|
||||
func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
|
||||
|
||||
if err := job.GetContext().Err(); err != nil {
|
||||
job.Result.Finish(fmt.Errorf("expired"))
|
||||
return
|
||||
@@ -659,12 +658,9 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
|
||||
}
|
||||
}
|
||||
|
||||
//xlog.Debug("Picked action", "agent", a.Character.Name, "action", chosenAction.Definition().Name, "reasoning", reasoning)
|
||||
if chosenAction == nil {
|
||||
// If no action was picked up, the reasoning is the message returned by the assistant
|
||||
// so we can consume it as if it was a reply.
|
||||
//job.Result.SetResult(ActionState{ActionCurrentState{nil, nil, "No action to do, just reply"}, ""})
|
||||
//job.Result.Finish(fmt.Errorf("no action to do"))\
|
||||
xlog.Info("No action to do, just reply", "agent", a.Character.Name, "reasoning", reasoning)
|
||||
|
||||
if reasoning != "" {
|
||||
@@ -684,6 +680,23 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
|
||||
reasoning = msg.Content
|
||||
}
|
||||
|
||||
var satisfied bool
|
||||
var err error
|
||||
// Evaluate the response
|
||||
satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop())
|
||||
if err != nil {
|
||||
job.Result.Finish(fmt.Errorf("error evaluating response: %w", err))
|
||||
return
|
||||
}
|
||||
|
||||
if !satisfied {
|
||||
// If not satisfied, continue with the conversation
|
||||
job.ConversationHistory = conv
|
||||
job.IncrementEvaluationLoop()
|
||||
a.consumeJob(job, role, retries)
|
||||
return
|
||||
}
|
||||
|
||||
xlog.Debug("Finish job with reasoning", "reasoning", reasoning, "agent", a.Character.Name, "conversation", fmt.Sprintf("%+v", conv))
|
||||
job.Result.Conversation = conv
|
||||
job.Result.AddFinalizer(func(conv []openai.ChatCompletionMessage) {
|
||||
@@ -773,8 +786,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
|
||||
conv, err = a.handlePlanning(job.GetContext(), job, chosenAction, actionParams, reasoning, pickTemplate, conv)
|
||||
if err != nil {
|
||||
xlog.Error("error handling planning", "error", err)
|
||||
//job.Result.Conversation = conv
|
||||
//job.Result.SetResponse(msg.Content)
|
||||
a.reply(job, role, append(conv, openai.ChatCompletionMessage{
|
||||
Role: "assistant",
|
||||
Content: fmt.Sprintf("Error handling planning: %v", err),
|
||||
@@ -821,9 +832,6 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
|
||||
if !chosenAction.Definition().Name.Is(action.PlanActionName) {
|
||||
result, err := a.runAction(job, chosenAction, actionParams)
|
||||
if err != nil {
|
||||
//job.Result.Finish(fmt.Errorf("error running action: %w", err))
|
||||
//return
|
||||
// make the LLM aware of the error of running the action instead of stopping the job here
|
||||
result.Result = fmt.Sprintf("Error running tool: %v", err)
|
||||
}
|
||||
|
||||
@@ -866,6 +874,22 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
|
||||
return
|
||||
}
|
||||
|
||||
// Evaluate the final response
|
||||
var satisfied bool
|
||||
satisfied, conv, err = a.handleEvaluation(job, conv, job.GetEvaluationLoop())
|
||||
if err != nil {
|
||||
job.Result.Finish(fmt.Errorf("error evaluating response: %w", err))
|
||||
return
|
||||
}
|
||||
|
||||
if !satisfied {
|
||||
// If not satisfied, continue with the conversation
|
||||
job.ConversationHistory = conv
|
||||
job.IncrementEvaluationLoop()
|
||||
a.consumeJob(job, role, retries)
|
||||
return
|
||||
}
|
||||
|
||||
a.reply(job, role, conv, actionParams, chosenAction, reasoning)
|
||||
}
|
||||
|
||||
|
||||
162
core/agent/evaluation.go
Normal file
162
core/agent/evaluation.go
Normal file
@@ -0,0 +1,162 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/mudler/LocalAGI/core/types"
|
||||
"github.com/mudler/LocalAGI/pkg/llm"
|
||||
"github.com/mudler/LocalAGI/pkg/xlog"
|
||||
"github.com/sashabaranov/go-openai"
|
||||
"github.com/sashabaranov/go-openai/jsonschema"
|
||||
)
|
||||
|
||||
// EvaluationResult is the verdict that evaluateJob asks the LLM to produce
// when judging whether a conversation satisfied the user's request. Its JSON
// field names match the schema built in evaluateJob.
type EvaluationResult struct {
	// Satisfied reports whether the evaluator considers the request fulfilled.
	Satisfied bool `json:"satisfied"`
	// Gaps lists the shortcomings the evaluator identified, if any.
	Gaps []string `json:"gaps"`
	// Reasoning is the evaluator's free-text justification for the verdict.
	Reasoning string `json:"reasoning"`
}
|
||||
|
||||
// GoalExtraction is the summary of the user's intent that extractGoal asks
// the LLM to produce from a conversation. Its JSON field names match the
// schema built in extractGoal.
type GoalExtraction struct {
	// Goal is the main goal or request from the user.
	Goal string `json:"goal"`
	// Constraints holds any constraints or requirements the user specified.
	Constraints []string `json:"constraints"`
	// Context is additional information relevant for understanding the goal.
	Context string `json:"context"`
}
|
||||
|
||||
func (a *Agent) extractGoal(job *types.Job, conv []openai.ChatCompletionMessage) (*GoalExtraction, error) {
|
||||
// Create the goal extraction schema
|
||||
schema := jsonschema.Definition{
|
||||
Type: jsonschema.Object,
|
||||
Properties: map[string]jsonschema.Definition{
|
||||
"goal": {
|
||||
Type: jsonschema.String,
|
||||
Description: "The main goal or request from the user",
|
||||
},
|
||||
"constraints": {
|
||||
Type: jsonschema.Array,
|
||||
Items: &jsonschema.Definition{
|
||||
Type: jsonschema.String,
|
||||
},
|
||||
Description: "Any constraints or requirements specified by the user",
|
||||
},
|
||||
"context": {
|
||||
Type: jsonschema.String,
|
||||
Description: "Additional context that might be relevant for understanding the goal",
|
||||
},
|
||||
},
|
||||
Required: []string{"goal", "constraints", "context"},
|
||||
}
|
||||
|
||||
// Create the goal extraction prompt
|
||||
prompt := `Analyze the conversation and extract the user's main goal, any constraints, and relevant context.
|
||||
Consider the entire conversation history to understand the complete context and requirements.
|
||||
Focus on identifying the primary objective and any specific requirements or limitations mentioned.`
|
||||
|
||||
var result GoalExtraction
|
||||
err := llm.GenerateTypedJSONWithConversation(job.GetContext(), a.client,
|
||||
append(
|
||||
[]openai.ChatCompletionMessage{
|
||||
{
|
||||
Role: "system",
|
||||
Content: prompt,
|
||||
},
|
||||
},
|
||||
conv...), a.options.LLMAPI.Model, schema, &result)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error extracting goal: %w", err)
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
func (a *Agent) evaluateJob(job *types.Job, conv []openai.ChatCompletionMessage) (*EvaluationResult, error) {
|
||||
if !a.options.enableEvaluation {
|
||||
return &EvaluationResult{Satisfied: true}, nil
|
||||
}
|
||||
|
||||
// Extract the goal first
|
||||
goal, err := a.extractGoal(job, conv)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error extracting goal: %w", err)
|
||||
}
|
||||
|
||||
// Create the evaluation schema
|
||||
schema := jsonschema.Definition{
|
||||
Type: jsonschema.Object,
|
||||
Properties: map[string]jsonschema.Definition{
|
||||
"satisfied": {
|
||||
Type: jsonschema.Boolean,
|
||||
},
|
||||
"gaps": {
|
||||
Type: jsonschema.Array,
|
||||
Items: &jsonschema.Definition{
|
||||
Type: jsonschema.String,
|
||||
},
|
||||
},
|
||||
"reasoning": {
|
||||
Type: jsonschema.String,
|
||||
},
|
||||
},
|
||||
Required: []string{"satisfied", "gaps", "reasoning"},
|
||||
}
|
||||
|
||||
// Create the evaluation prompt
|
||||
prompt := fmt.Sprintf(`Evaluate if the assistant has satisfied the user's request. Consider:
|
||||
1. The identified goal: %s
|
||||
2. Constraints and requirements: %v
|
||||
3. Context: %s
|
||||
4. The conversation history
|
||||
5. Any gaps or missing information
|
||||
6. Whether the response fully addresses the user's needs
|
||||
|
||||
Provide a detailed evaluation with specific gaps if any are found.`,
|
||||
goal.Goal,
|
||||
goal.Constraints,
|
||||
goal.Context)
|
||||
|
||||
var result EvaluationResult
|
||||
err = llm.GenerateTypedJSONWithConversation(job.GetContext(), a.client,
|
||||
append(
|
||||
[]openai.ChatCompletionMessage{
|
||||
{
|
||||
Role: "system",
|
||||
Content: prompt,
|
||||
},
|
||||
},
|
||||
conv...),
|
||||
a.options.LLMAPI.Model, schema, &result)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error generating evaluation: %w", err)
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
func (a *Agent) handleEvaluation(job *types.Job, conv []openai.ChatCompletionMessage, currentLoop int) (bool, []openai.ChatCompletionMessage, error) {
|
||||
if !a.options.enableEvaluation || currentLoop >= a.options.maxEvaluationLoops {
|
||||
return true, conv, nil
|
||||
}
|
||||
|
||||
result, err := a.evaluateJob(job, conv)
|
||||
if err != nil {
|
||||
return false, conv, err
|
||||
}
|
||||
|
||||
if result.Satisfied {
|
||||
return true, conv, nil
|
||||
}
|
||||
|
||||
// If there are gaps, we need to address them
|
||||
if len(result.Gaps) > 0 {
|
||||
// Add the evaluation result to the conversation
|
||||
conv = append(conv, openai.ChatCompletionMessage{
|
||||
Role: "system",
|
||||
Content: fmt.Sprintf("Evaluation found gaps that need to be addressed:\n%s\nReasoning: %s",
|
||||
result.Gaps, result.Reasoning),
|
||||
})
|
||||
|
||||
xlog.Debug("Evaluation found gaps, incrementing loop count", "loop", currentLoop+1)
|
||||
return false, conv, nil
|
||||
}
|
||||
|
||||
return true, conv, nil
|
||||
}
|
||||
@@ -12,7 +12,7 @@ func (a *Agent) generateIdentity(guidance string) error {
|
||||
guidance = "Generate a random character for roleplaying."
|
||||
}
|
||||
|
||||
err := llm.GenerateTypedJSON(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character)
|
||||
err := llm.GenerateTypedJSONWithGuidance(a.context.Context, a.client, "Generate a character as JSON data. "+guidance, a.options.LLMAPI.Model, a.options.character.ToJSONSchema(), &a.options.character)
|
||||
//err := llm.GenerateJSONFromStruct(a.context.Context, a.client, guidance, a.options.LLMAPI.Model, &a.options.character)
|
||||
a.Character = a.options.character
|
||||
if err != nil {
|
||||
|
||||
@@ -42,6 +42,10 @@ type options struct {
|
||||
kbResults int
|
||||
ragdb RAGDB
|
||||
|
||||
// Evaluation settings
|
||||
maxEvaluationLoops int
|
||||
enableEvaluation bool
|
||||
|
||||
prompts []DynamicPrompt
|
||||
|
||||
systemPrompt string
|
||||
@@ -68,9 +72,11 @@ func (o *options) SeparatedMultimodalModel() bool {
|
||||
|
||||
func defaultOptions() *options {
|
||||
return &options{
|
||||
parallelJobs: 1,
|
||||
periodicRuns: 15 * time.Minute,
|
||||
parallelJobs: 1,
|
||||
periodicRuns: 15 * time.Minute,
|
||||
loopDetectionSteps: 10,
|
||||
maxEvaluationLoops: 2,
|
||||
enableEvaluation: false,
|
||||
LLMAPI: llmOptions{
|
||||
APIURL: "http://localhost:8080",
|
||||
Model: "gpt-4",
|
||||
@@ -392,3 +398,17 @@ var EnableStripThinkingTags = func(o *options) error {
|
||||
o.stripThinkingTags = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func WithMaxEvaluationLoops(loops int) Option {
|
||||
return func(o *options) error {
|
||||
o.maxEvaluationLoops = loops
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func EnableEvaluation() Option {
|
||||
return func(o *options) error {
|
||||
o.enableEvaluation = true
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,6 +74,8 @@ type AgentConfig struct {
|
||||
SummaryLongTermMemory bool `json:"summary_long_term_memory" form:"summary_long_term_memory"`
|
||||
ParallelJobs int `json:"parallel_jobs" form:"parallel_jobs"`
|
||||
StripThinkingTags bool `json:"strip_thinking_tags" form:"strip_thinking_tags"`
|
||||
EnableEvaluation bool `json:"enable_evaluation" form:"enable_evaluation"`
|
||||
MaxEvaluationLoops int `json:"max_evaluation_loops" form:"max_evaluation_loops"`
|
||||
}
|
||||
|
||||
type AgentConfigMeta struct {
|
||||
@@ -309,6 +311,24 @@ func NewAgentConfigMeta(
|
||||
HelpText: "Remove content between <thinking></thinking> and <think></think> tags from agent responses",
|
||||
Tags: config.Tags{Section: "ModelSettings"},
|
||||
},
|
||||
{
|
||||
Name: "enable_evaluation",
|
||||
Label: "Enable Evaluation",
|
||||
Type: "checkbox",
|
||||
DefaultValue: false,
|
||||
HelpText: "Enable automatic evaluation of agent responses to ensure they meet user requirements",
|
||||
Tags: config.Tags{Section: "AdvancedSettings"},
|
||||
},
|
||||
{
|
||||
Name: "max_evaluation_loops",
|
||||
Label: "Max Evaluation Loops",
|
||||
Type: "number",
|
||||
DefaultValue: 2,
|
||||
Min: 1,
|
||||
Step: 1,
|
||||
HelpText: "Maximum number of evaluation loops to perform when addressing gaps in responses",
|
||||
Tags: config.Tags{Section: "AdvancedSettings"},
|
||||
},
|
||||
},
|
||||
MCPServers: []config.Field{
|
||||
{
|
||||
|
||||
@@ -247,7 +247,7 @@ func createAgentAvatar(APIURL, APIKey, model, imageModel, avatarDir string, agen
|
||||
ImagePrompt string `json:"image_prompt"`
|
||||
}
|
||||
|
||||
err := llm.GenerateTypedJSON(
|
||||
err := llm.GenerateTypedJSONWithGuidance(
|
||||
context.Background(),
|
||||
llm.NewClient(APIKey, APIURL, "10m"),
|
||||
"Generate a prompt that I can use to create a random avatar for the bot '"+agent.Name+"', the description of the bot is: "+agent.Description,
|
||||
@@ -561,6 +561,13 @@ func (a *AgentPool) startAgentWithConfig(name string, config *AgentConfig, obs O
|
||||
opts = append(opts, WithParallelJobs(config.ParallelJobs))
|
||||
}
|
||||
|
||||
if config.EnableEvaluation {
|
||||
opts = append(opts, EnableEvaluation())
|
||||
if config.MaxEvaluationLoops > 0 {
|
||||
opts = append(opts, WithMaxEvaluationLoops(config.MaxEvaluationLoops))
|
||||
}
|
||||
}
|
||||
|
||||
xlog.Info("Starting agent", "name", name, "config", config)
|
||||
|
||||
agent, err := New(opts...)
|
||||
|
||||
@@ -162,23 +162,23 @@ func newUUID() string {
|
||||
// To wait for a Job result, use JobResult.WaitResult()
|
||||
func NewJob(opts ...JobOption) *Job {
|
||||
j := &Job{
|
||||
Result: NewJobResult(),
|
||||
UUID: newUUID(),
|
||||
}
|
||||
for _, o := range opts {
|
||||
o(j)
|
||||
Result: NewJobResult(),
|
||||
UUID: uuid.New().String(),
|
||||
Metadata: make(map[string]interface{}),
|
||||
context: context.Background(),
|
||||
ConversationHistory: []openai.ChatCompletionMessage{},
|
||||
}
|
||||
|
||||
var ctx context.Context
|
||||
if j.context == nil {
|
||||
ctx = context.Background()
|
||||
} else {
|
||||
ctx = j.context
|
||||
for _, opt := range opts {
|
||||
opt(j)
|
||||
}
|
||||
|
||||
context, cancel := context.WithCancel(ctx)
|
||||
j.context = context
|
||||
// Store the original request if it exists in the conversation history
|
||||
|
||||
ctx, cancel := context.WithCancel(j.context)
|
||||
j.context = ctx
|
||||
j.cancel = cancel
|
||||
|
||||
return j
|
||||
}
|
||||
|
||||
@@ -207,3 +207,23 @@ func WithObservable(obs *Observable) JobOption {
|
||||
j.Obs = obs
|
||||
}
|
||||
}
|
||||
|
||||
// GetEvaluationLoop returns the current evaluation loop count
|
||||
func (j *Job) GetEvaluationLoop() int {
|
||||
if j.Metadata == nil {
|
||||
j.Metadata = make(map[string]interface{})
|
||||
}
|
||||
if loop, ok := j.Metadata["evaluation_loop"].(int); ok {
|
||||
return loop
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// IncrementEvaluationLoop increments the evaluation loop count
|
||||
func (j *Job) IncrementEvaluationLoop() {
|
||||
if j.Metadata == nil {
|
||||
j.Metadata = make(map[string]interface{})
|
||||
}
|
||||
currentLoop := j.GetEvaluationLoop()
|
||||
j.Metadata["evaluation_loop"] = currentLoop + 1
|
||||
}
|
||||
|
||||
@@ -10,16 +10,20 @@ import (
|
||||
"github.com/sashabaranov/go-openai/jsonschema"
|
||||
)
|
||||
|
||||
func GenerateTypedJSON(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error {
|
||||
func GenerateTypedJSONWithGuidance(ctx context.Context, client *openai.Client, guidance, model string, i jsonschema.Definition, dst any) error {
|
||||
return GenerateTypedJSONWithConversation(ctx, client, []openai.ChatCompletionMessage{
|
||||
{
|
||||
Role: "user",
|
||||
Content: guidance,
|
||||
},
|
||||
}, model, i, dst)
|
||||
}
|
||||
|
||||
func GenerateTypedJSONWithConversation(ctx context.Context, client *openai.Client, conv []openai.ChatCompletionMessage, model string, i jsonschema.Definition, dst any) error {
|
||||
toolName := "json"
|
||||
decision := openai.ChatCompletionRequest{
|
||||
Model: model,
|
||||
Messages: []openai.ChatCompletionMessage{
|
||||
{
|
||||
Role: "user",
|
||||
Content: guidance,
|
||||
},
|
||||
},
|
||||
Model: model,
|
||||
Messages: conv,
|
||||
Tools: []openai.Tool{
|
||||
{
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ func (f *ClassifierFilter) Apply(job *types.Job) (bool, error) {
|
||||
var result struct {
|
||||
Asserted bool `json:"answer"`
|
||||
}
|
||||
err := llm.GenerateTypedJSON(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{
|
||||
err := llm.GenerateTypedJSONWithGuidance(job.GetContext(), f.client, guidance, f.model, jsonschema.Definition{
|
||||
Type: jsonschema.Object,
|
||||
Properties: map[string]jsonschema.Definition{
|
||||
"answer": {
|
||||
|
||||
@@ -576,7 +576,7 @@ func (a *App) GenerateGroupProfiles(pool *state.AgentPool) func(c *fiber.Ctx) er
|
||||
|
||||
xlog.Debug("Generating group", "description", request.Descript)
|
||||
client := llm.NewClient(a.config.LLMAPIKey, a.config.LLMAPIURL, "10m")
|
||||
err := llm.GenerateTypedJSON(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{
|
||||
err := llm.GenerateTypedJSONWithGuidance(c.Context(), client, request.Descript, a.config.LLMModel, jsonschema.Definition{
|
||||
Type: jsonschema.Object,
|
||||
Properties: map[string]jsonschema.Definition{
|
||||
"agents": {
|
||||
|
||||
Reference in New Issue
Block a user