diff --git a/core/agent/actions.go b/core/agent/actions.go
index 2991ec1..5df6a40 100644
--- a/core/agent/actions.go
+++ b/core/agent/actions.go
@@ -150,6 +150,14 @@ func (m Messages) GetLatestUserMessage() *openai.ChatCompletionMessage {
 	return nil
 }
 
+// IsLastMessageFromRole reports whether the final message in m has the given role; it is false when m is empty.
+func (m Messages) IsLastMessageFromRole(role string) bool {
+	if n := len(m); n > 0 {
+		return m[n-1].Role == role
+	}
+	return false
+}
+
 func (a *Agent) generateParameters(ctx context.Context, pickTemplate string, act Action, c []openai.ChatCompletionMessage, reasoning string) (*decisionResult, error) {
 
 	stateHUD, err := renderTemplate(pickTemplate, a.prepareHUD(), a.systemInternalActions(), reasoning)
diff --git a/core/agent/agent.go b/core/agent/agent.go
index 9256119..704c70e 100644
--- a/core/agent/agent.go
+++ b/core/agent/agent.go
@@ -286,9 +286,11 @@ func (a *Agent) processPrompts() {
 }
 
 func (a *Agent) describeImage(ctx context.Context, model, imageURL string) (string, error) {
+	xlog.Debug("Describing image", "model", model, "image", imageURL)
 	resp, err := a.client.CreateChatCompletion(ctx,
 		openai.ChatCompletionRequest{
-			Model: model, Messages: []openai.ChatCompletionMessage{
+			Model: model,
+			Messages: []openai.ChatCompletionMessage{
 				{
 
 					Role: "user",
@@ -300,6 +302,7 @@ func (a *Agent) describeImage(ctx context.Context, model, imageURL string) (stri
 						{
 							Type: openai.ChatMessagePartTypeImageURL,
 							ImageURL: &openai.ChatMessageImageURL{
+
 								URL: imageURL,
 							},
 						},
@@ -313,6 +316,7 @@ func (a *Agent) describeImage(ctx context.Context, model, imageURL string) (stri
 		return "", fmt.Errorf("no choices")
 	}
 
+	xlog.Debug("Described image", "description", resp.Choices[0].Message.Content)
 	return resp.Choices[0].Message.Content, nil
 }
 
@@ -343,7 +347,7 @@ func (a *Agent) processUserInputs(job *Job, role string) {
 	// and add it to the conversation context
 	if a.options.SeparatedMultimodalModel() && noNewMessage {
 		lastUserMessage := a.currentConversation.GetLatestUserMessage()
-		if lastUserMessage != nil {
+		if lastUserMessage != nil && a.currentConversation.IsLastMessageFromRole(UserRole) {
 			imageURL, text, err := extractImageContent(*lastUserMessage)
 			if err == nil {
 				// We have an image, we need to describe it first
@@ -361,6 +365,7 @@ func (a *Agent) processUserInputs(job *Job, role string) {
 						Role:    role,
 						Content: text,
 					})
+					xlog.Debug("Conversation after image description", "conversation", a.currentConversation)
 				}
 			}
 		}