feat: add capability to understand images

Signed-off-by: mudler <mudler@localai.io>
2025-03-08 17:52:19 +01:00
parent 106d1e61d4
commit 3827ebebdf
7 changed files with 218 additions and 56 deletions
--- a/README.md
+++ b/README.md
@@ -78,6 +78,7 @@ LocalAgent can be configured using the following environment variables:
 | Variable                      | Description                                      |
 |-------------------------------|--------------------------------------------------|
 | `LOCALAGENT_MODEL`                  | Specifies the test model to use                  |
 | `LOCALAGENT_MULTIMODAL_MODEL`       | Specifies a separate model to use for multimodal (image) input. Optional: only needed when `LOCALAGENT_MODEL` does not support multimodality |
 | `LOCALAGENT_LLM_API_URL`                     | URL of the API server                            |
 | `LOCALAGENT_API_KEY`                     | API key for authentication                       |
 | `LOCALAGENT_TIMEOUT`                     | Timeout duration for requests                    |
--- a/core/agent/actions.go
+++ b/core/agent/actions.go
@@ -139,6 +139,17 @@ func (m Messages) Save(path string) error {
 	return nil
 }
 // GetLatestUserMessage returns a pointer to the most recent message in m
 // whose role is UserRole, or nil when m holds no such message.
 //
 // The pointer refers to the element stored in m itself, not to a copy, so
 // changes made through it are visible in the conversation.
 func (m Messages) GetLatestUserMessage() *openai.ChatCompletionMessage {
 	for i := len(m) - 1; i >= 0; i-- {
 		if m[i].Role == UserRole {
 			// Index the slice directly: the address of a loop-local copy
 			// would detach the result from the conversation.
 			return &m[i]
 		}
 	}
 	return nil
 }
 func (a *Agent) generateParameters(ctx context.Context, pickTemplate string, act Action, c []openai.ChatCompletionMessage, reasoning string) (*decisionResult, error) {
 	stateHUD, err := renderTemplate(pickTemplate, a.prepareHUD(), a.systemInternalActions(), reasoning)
--- a/core/agent/agent.go
+++ b/core/agent/agent.go
@@ -249,6 +249,171 @@ func (a *Agent) runAction(chosenAction Action, params action.ActionParams) (resu
 	return result, nil
 }
 // processPrompts prepends the rendered custom prompts and then the configured
 // system prompt to the current conversation.
 //
 // Prompts that fail to render are logged and skipped; prompts that render to
 // an empty string are skipped. Because every insertion happens at the front,
 // the system prompt (inserted last) ends up as the first message.
 func (a *Agent) processPrompts() {
 	for _, block := range a.options.prompts {
 		rendered, err := block.Render(a)
 		switch {
 		case err != nil:
 			xlog.Error("Error rendering prompt", "error", err)
 			continue
 		case rendered == "":
 			xlog.Debug("Prompt is empty, skipping", "agent", a.Character.Name)
 			continue
 		}
 		// NOTE(review): the guard looks for the system prompt, not for the
 		// rendered prompt itself - confirm this is the intended check.
 		if Messages(a.currentConversation).Exist(a.options.systemPrompt) {
 			continue
 		}
 		head := []openai.ChatCompletionMessage{{
 			Role:    block.Role(),
 			Content: rendered,
 		}}
 		a.currentConversation = append(head, a.currentConversation...)
 	}
 	// TODO: move to a Promptblock?
 	sysPrompt := a.options.systemPrompt
 	if sysPrompt == "" || Messages(a.currentConversation).Exist(sysPrompt) {
 		return
 	}
 	head := []openai.ChatCompletionMessage{{
 		Role:    "system",
 		Content: sysPrompt,
 	}}
 	a.currentConversation = append(head, a.currentConversation...)
 }
 // describeImage asks the given model what the image at imageURL contains and
 // returns the content of the first completion choice.
 //
 // It sends a single user message made of a fixed text question and the image
 // URL. The client error is returned unchanged; a response without choices
 // yields a "no choices" error.
 func (a *Agent) describeImage(ctx context.Context, model, imageURL string) (string, error) {
 	question := openai.ChatMessagePart{
 		Type: openai.ChatMessagePartTypeText,
 		Text: "What is in the image?",
 	}
 	picture := openai.ChatMessagePart{
 		Type:     openai.ChatMessagePartTypeImageURL,
 		ImageURL: &openai.ChatMessageImageURL{URL: imageURL},
 	}
 	request := openai.ChatCompletionRequest{
 		Model: model,
 		Messages: []openai.ChatCompletionMessage{{
 			Role:         "user",
 			MultiContent: []openai.ChatMessagePart{question, picture},
 		}},
 	}
 	resp, err := a.client.CreateChatCompletion(ctx, request)
 	if err != nil {
 		return "", err
 	}
 	if choices := resp.Choices; len(choices) > 0 {
 		return choices[0].Message.Content, nil
 	}
 	return "", fmt.Errorf("no choices")
 }
 // extractImageContent scans the multi-part content of message and returns the
 // image URL it carries together with any accompanying text.
 //
 // When several parts of the same kind are present the last one wins. A
 // "no image found" error is returned, with empty strings, when the message has
 // no image part with a non-empty URL; a message that only carries text is
 // therefore not reported as containing an image.
 func extractImageContent(message openai.ChatCompletionMessage) (imageURL, text string, e error) {
 	for _, part := range message.MultiContent {
 		switch part.Type {
 		case openai.ChatMessagePartTypeImageURL:
 			// ImageURL is a pointer; skip parts that do not actually carry one.
 			if part.ImageURL != nil && part.ImageURL.URL != "" {
 				imageURL = part.ImageURL.URL
 			}
 		case openai.ChatMessagePartTypeText:
 			text = part.Text
 		}
 	}
 	if imageURL == "" {
 		return "", "", fmt.Errorf("no image found")
 	}
 	return imageURL, text, nil
 }
 // processUserInputs appends the text and/or image carried by job to the
 // current conversation, using role for the user-provided messages.
 //
 // When a separate multimodal model is configured (SeparatedMultimodalModel),
 // images are first described by that model and the description is added as a
 // system message, so the main model only receives text. Otherwise text and
 // image are sent together as one multi-part message.
 //
 // When the job carries neither text nor image and a separate multimodal model
 // is configured, the latest user message already in the conversation is
 // inspected for an image instead.
 func (a *Agent) processUserInputs(job *Job, role string) {
 	noNewMessage := job.Text == "" && job.Image == ""
 	onlyText := job.Text != "" && job.Image == ""
 	// Walk the conversation history and check whether the last user message
 	// contains an image. If so, describe it with the multimodal model and put
 	// the description into the conversation context.
 	if a.options.SeparatedMultimodalModel() && noNewMessage {
 		lastUserMessage := a.currentConversation.GetLatestUserMessage()
 		if lastUserMessage != nil {
 			imageURL, text, err := extractImageContent(*lastUserMessage)
 			// Require an actual URL so a text-only message never triggers a
 			// describe call with nothing to look at.
 			if err == nil && imageURL != "" {
 				imageDescription, err := a.describeImage(a.context.Context, a.options.LLMAPI.MultimodalModel, imageURL)
 				if err != nil {
 					xlog.Error("Error describing image", "error", err)
 				} else {
 					// Replace the user message with the image description and
 					// re-add the user text as its own message.
 					// NOTE(review): the replacement only reaches the conversation
 					// if GetLatestUserMessage returns a pointer to the stored
 					// message rather than to a copy - verify.
 					lastUserMessage.Content = fmt.Sprintf("The user shared an image which can be described as: %s", imageDescription)
 					lastUserMessage.MultiContent = nil
 					lastUserMessage.Role = "system"
 					a.currentConversation = append(a.currentConversation, openai.ChatCompletionMessage{
 						Role:    role,
 						Content: text,
 					})
 				}
 			}
 		}
 	}
 	if onlyText {
 		a.currentConversation = append(a.currentConversation, openai.ChatCompletionMessage{
 			Role:    role,
 			Content: job.Text,
 		})
 	}
 	if job.Image != "" {
 		// Two cases: the main model handles images itself, or a separate
 		// multimodal model has to describe the image first.
 		if a.options.SeparatedMultimodalModel() {
 			// The main model cannot read images here, so the description must
 			// come from the multimodal model.
 			imageDescription, err := a.describeImage(a.context.Context, a.options.LLMAPI.MultimodalModel, job.Image)
 			if err != nil {
 				xlog.Error("Error describing image", "error", err)
 				// Do not lose the user's text just because the image could not
 				// be described.
 				if job.Text != "" {
 					a.currentConversation = append(a.currentConversation, openai.ChatCompletionMessage{
 						Role:    role,
 						Content: job.Text,
 					})
 				}
 			} else {
 				a.currentConversation = append(a.currentConversation, openai.ChatCompletionMessage{
 					Role:    "system",
 					Content: fmt.Sprintf("The user shared an image which can be described as: %s", imageDescription),
 				})
 				a.currentConversation = append(a.currentConversation, openai.ChatCompletionMessage{
 					Role:    role,
 					Content: job.Text,
 				})
 			}
 		} else {
 			// Send image and text together in a single multi-part message.
 			a.currentConversation = append(a.currentConversation, openai.ChatCompletionMessage{
 				Role: role,
 				MultiContent: []openai.ChatMessagePart{
 					{
 						Type: openai.ChatMessagePartTypeText,
 						Text: job.Text,
 					},
 					{
 						Type: openai.ChatMessagePartTypeImageURL,
 						ImageURL: &openai.ChatMessageImageURL{
 							URL: job.Image,
 						},
 					},
 				},
 			})
 		}
 	}
 }
 func (a *Agent) consumeJob(job *Job, role string) {
 	a.Lock()
 	paused := a.pause
@@ -290,46 +455,8 @@ func (a *Agent) consumeJob(job *Job, role string) {
 		}()
 	}
-	//if job.Image != "" {
+	a.processPrompts()
-	// TODO: Use llava to explain the image content
+	a.processUserInputs(job, role)
 	//}
 	// Add custom prompts
 	for _, prompt := range a.options.prompts {
 		message, err := prompt.Render(a)
 		if err != nil {
 			xlog.Error("Error rendering prompt", "error", err)
 			continue
 		}
 		if message == "" {
 			xlog.Debug("Prompt is empty, skipping", "agent", a.Character.Name)
 			continue
 		}
 		if !Messages(a.currentConversation).Exist(a.options.systemPrompt) {
 			a.currentConversation = append([]openai.ChatCompletionMessage{
 				{
 					Role:    prompt.Role(),
 					Content: message,
 				}}, a.currentConversation...)
 		}
 	}
 	// TODO: move to a Promptblock?
 	if a.options.systemPrompt != "" {
 		if !Messages(a.currentConversation).Exist(a.options.systemPrompt) {
 			a.currentConversation = append([]openai.ChatCompletionMessage{
 				{
 					Role:    "system",
 					Content: a.options.systemPrompt,
 				}}, a.currentConversation...)
 		}
 	}
 	if job.Text != "" {
 		a.currentConversation = append(a.currentConversation, openai.ChatCompletionMessage{
 			Role:    role,
 			Content: job.Text,
 		})
 	}
 	// RAG
 	a.knowledgeBaseLookup()
--- a/core/agent/options.go
+++ b/core/agent/options.go
@@ -7,10 +7,12 @@ import (
 )
 // Option is a function that adjusts an options value and reports an error
 // when it cannot be applied.
 type Option func(*options) error
 // llmOptions groups the LLM-related settings of an agent.
 type llmOptions struct {
 	// presumably the base URL of the LLM API server - usage not visible here
 	APIURL          string
 	// presumably the credential sent to the LLM API - usage not visible here
 	APIKey          string
 	// Model is the name of the agent's main model.
 	Model           string
 	// MultimodalModel is the name of the model used to describe images; it is
 	// only treated as separate when non-empty and different from Model.
 	MultimodalModel string
 }
 type options struct {
@@ -44,6 +46,10 @@ type options struct {
 	conversationsPath string
 }
 // SeparatedMultimodalModel reports whether a multimodal model is configured
 // and differs from the main model.
 func (o *options) SeparatedMultimodalModel() bool {
 	multimodal := o.LLMAPI.MultimodalModel
 	if multimodal == "" {
 		return false
 	}
 	return multimodal != o.LLMAPI.Model
 }
 func defaultOptions() *options {
 	return &options{
 		periodicRuns: 15 * time.Minute,
@@ -209,6 +215,13 @@ func WithLLMAPIKey(key string) Option {
 	}
 }
 // WithMultimodalModel returns an Option that stores model as the multimodal
 // model in the LLM settings. The returned Option never fails.
 func WithMultimodalModel(model string) Option {
 	setModel := func(opts *options) error {
 		opts.LLMAPI.MultimodalModel = model
 		return nil
 	}
 	return setModel
 }
 func WithPermanentGoal(goal string) Option {
 	return func(o *options) error {
 		o.permanentGoal = goal
--- a/core/state/config.go
+++ b/core/state/config.go
@@ -34,6 +34,7 @@ type AgentConfig struct {
 	// This is what needs to be part of ActionsConfig
 	Model                 string `json:"model" form:"model"`
 	MultimodalModel       string `json:"multimodal_model" form:"multimodal_model"`
 	Name                  string `json:"name" form:"name"`
 	HUD                   bool   `json:"hud" form:"hud"`
 	StandaloneJob         bool   `json:"standalone_job" form:"standalone_job"`
--- a/core/state/pool.go
+++ b/core/state/pool.go
@@ -27,7 +27,7 @@ type AgentPool struct {
 	agents                                              map[string]*Agent
 	managers                                            map[string]sse.Manager
 	agentStatus                                         map[string]*Status
-	apiURL, model, localRAGAPI, apiKey string
+	apiURL, model, multimodalModel, localRAGAPI, apiKey string
 	availableActions                                    func(*AgentConfig) func(ctx context.Context) []Action
 	connectors                                          func(*AgentConfig) []Connector
 	promptBlocks                                        func(*AgentConfig) []PromptBlock
@@ -66,7 +66,7 @@ func loadPoolFromFile(path string) (*AgentPoolData, error) {
 }
 func NewAgentPool(
-	model, apiURL, apiKey, directory string,
+	model, multimodalModel, apiURL, apiKey, directory string,
 	LocalRAGAPI string,
 	availableActions func(*AgentConfig) func(ctx context.Context) []agent.Action,
 	connectors func(*AgentConfig) []Connector,
@@ -91,6 +91,7 @@ func NewAgentPool(
 			pooldir:          directory,
 			apiURL:           apiURL,
 			model:            model,
 			multimodalModel:  multimodalModel,
 			localRAGAPI:      LocalRAGAPI,
 			apiKey:           apiKey,
 			agents:           make(map[string]*Agent),
@@ -114,6 +115,7 @@ func NewAgentPool(
 		apiURL:           apiURL,
 		pooldir:          directory,
 		model:            model,
 		multimodalModel:  multimodalModel,
 		apiKey:           apiKey,
 		agents:           make(map[string]*Agent),
 		managers:         make(map[string]sse.Manager),
@@ -165,6 +167,10 @@ func (a *AgentPool) startAgentWithConfig(name string, config *AgentConfig) error
 	manager := sse.NewManager(5)
 	ctx := context.Background()
 	model := a.model
 	multimodalModel := a.multimodalModel
 	if config.MultimodalModel != "" {
 		multimodalModel = config.MultimodalModel
 	}
 	if config.Model != "" {
 		model = config.Model
 	}
@@ -244,6 +250,7 @@ func (a *AgentPool) startAgentWithConfig(name string, config *AgentConfig) error
 			return true
 		}),
 		WithSystemPrompt(config.SystemPrompt),
 		WithMultimodalModel(multimodalModel),
 		WithAgentResultCallback(func(state ActionState) {
 			a.Lock()
 			if _, ok := a.agentStatus[name]; !ok {
--- a/main.go
+++ b/main.go
@@ -11,6 +11,7 @@ import (
 )
 // Settings read from the process environment when the package initializes.
 // os.Getenv yields an empty string for variables that are not set.
 var testModel = os.Getenv("LOCALAGENT_MODEL")
 // multimodalModel is passed to state.NewAgentPool right after testModel.
 var multimodalModel = os.Getenv("LOCALAGENT_MULTIMODAL_MODEL")
 var apiURL = os.Getenv("LOCALAGENT_LLM_API_URL")
 var apiKey = os.Getenv("LOCALAGENT_API_KEY")
 // timeout is kept as the raw string from the environment.
 var timeout = os.Getenv("LOCALAGENT_TIMEOUT")
@@ -45,6 +46,7 @@ func main() {
 	// Create the agent pool
 	pool, err := state.NewAgentPool(
 		testModel,
 		multimodalModel,
 		apiURL,
 		apiKey,
 		stateDir,