From 9e40eea4380a2ef4246ba2c1b3e7ea44bf45f689 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe
Date: Thu, 12 Jun 2025 05:11:43 +0100
Subject: [PATCH] feat(api): Handle tool calls in responses API

Signed-off-by: Richard Palethorpe
---
 core/types/job.go     |  34 ++++
 webui/app.go          |  20 +-
 webui/types/openai.go | 441 +++++++++++++++++++++++++++++++++++-------
 3 files changed, 426 insertions(+), 69 deletions(-)

diff --git a/core/types/job.go b/core/types/job.go
index c701c05..c86fabc 100644
--- a/core/types/job.go
+++ b/core/types/job.go
@@ -20,6 +20,10 @@ type Job struct {
 	UUID string
 	Metadata map[string]interface{}
 	DoneFilter bool
+
+	// Tools available for this job
+	BuiltinTools []openai.Tool // Built-in tools like web search
+	UserTools []openai.Tool // User-defined function tools
 
 	pastActions []*ActionRequest
 	nextAction *Action
@@ -45,6 +49,18 @@ func WithConversationHistory(history []openai.ChatCompletionMessage) JobOption {
 	}
 }
 
+func WithBuiltinTools(tools []openai.Tool) JobOption {
+	return func(j *Job) {
+		j.BuiltinTools = tools
+	}
+}
+
+func WithUserTools(tools []openai.Tool) JobOption {
+	return func(j *Job) {
+		j.UserTools = tools
+	}
+}
+
 func WithReasoningCallback(f func(ActionCurrentState) bool) JobOption {
 	return func(r *Job) {
 		r.ReasoningCallback = f
@@ -227,3 +243,21 @@ func (j *Job) IncrementEvaluationLoop() {
 	currentLoop := j.GetEvaluationLoop()
 	j.Metadata["evaluation_loop"] = currentLoop + 1
 }
+
+// GetBuiltinTools returns the builtin tools for this job
+func (j *Job) GetBuiltinTools() []openai.Tool {
+	return j.BuiltinTools
+}
+
+// GetUserTools returns the user tools for this job
+func (j *Job) GetUserTools() []openai.Tool {
+	return j.UserTools
+}
+
+// GetAllTools returns a new slice holding the builtin tools followed by the user tools
+func (j *Job) GetAllTools() []openai.Tool {
+	allTools := make([]openai.Tool, 0, len(j.BuiltinTools)+len(j.UserTools))
+	allTools = append(allTools, j.BuiltinTools...)
+	allTools = append(allTools, j.UserTools...)
+	return allTools
+}
diff --git a/webui/app.go b/webui/app.go
index 42a5223..7929c96 100644
--- a/webui/app.go
+++ b/webui/app.go
@@ -513,9 +513,25 @@ func (a *App) Responses(pool *state.AgentPool, tracker *conversations.Conversati
 			return c.Status(http.StatusInternalServerError).JSON(types.ResponseBody{Error: "Agent not found"})
 		}
 
-		res := a.Ask(
+		// Prepare job options
+		jobOptions := []coreTypes.JobOption{
 			coreTypes.WithConversationHistory(messages),
-		)
+		}
+
+		// Add tools if present in the request
+		if len(request.Tools) > 0 {
+			builtinTools, userTools := types.SeparateTools(request.Tools)
+			if len(builtinTools) > 0 {
+				jobOptions = append(jobOptions, coreTypes.WithBuiltinTools(builtinTools))
+				xlog.Debug("Adding builtin tools to job", "count", len(builtinTools), "agent", agentName)
+			}
+			if len(userTools) > 0 {
+				jobOptions = append(jobOptions, coreTypes.WithUserTools(userTools))
+				xlog.Debug("Adding user tools to job", "count", len(userTools), "agent", agentName)
+			}
+		}
+
+		res := a.Ask(jobOptions...)
 		if res.Error != nil {
 			xlog.Error("Error asking agent", "agent", agentName, "error", res.Error)
 
diff --git a/webui/types/openai.go b/webui/types/openai.go
index 550a0d3..9f37433 100644
--- a/webui/types/openai.go
+++ b/webui/types/openai.go
@@ -7,12 +7,108 @@ import (
 	"github.com/sashabaranov/go-openai"
 )
 
+// Input represents either a string or a slice of Message
+type Input struct {
+	Text     *string    `json:"-"`
+	Messages *[]Message `json:"-"`
+}
+
+// UnmarshalJSON decodes a JSON string into Text or a JSON array into Messages
+func (i *Input) UnmarshalJSON(data []byte) error {
+	// Try to unmarshal as string first
+	var text string
+	if err := json.Unmarshal(data, &text); err == nil {
+		i.Text = &text
+		return nil
+	}
+
+	// Try to unmarshal as []Message
+	var messages []Message
+	if err := json.Unmarshal(data, &messages); err == nil {
+		i.Messages = &messages
+		return nil
+	}
+
+	return json.Unmarshal(data, &struct{}{}) // objects are tolerated (Input stays empty); other JSON kinds yield an error
+}
+
+// MarshalJSON encodes Input as a JSON string, array or null; value receiver so non-pointer values marshal too
+func (i Input) MarshalJSON() ([]byte, error) {
+	if i.Text != nil {
+		return json.Marshal(*i.Text)
+	}
+	if i.Messages != nil {
+		return json.Marshal(*i.Messages)
+	}
+	return json.Marshal(nil)
+}
+
+// IsText returns true if the input contains text
+func (i *Input) IsText() bool {
+	return i.Text != nil
+}
+
+// IsMessages returns true if the input contains messages
+func (i *Input) IsMessages() bool {
+	return i.Messages != nil
+}
+
+// GetText returns the text value or empty string
+func (i *Input) GetText() string {
+	if i.Text != nil {
+		return *i.Text
+	}
+	return ""
+}
+
+// GetMessages returns the messages value, or nil when no messages were set
+func (i *Input) GetMessages() []Message {
+	if i.Messages != nil {
+		return *i.Messages
+	}
+	return nil
+}
+
+// Message represents different types of messages in the input
+type Message struct {
+	// Common fields
+	Type string `json:"type,omitempty"`
+
+	// InputMessage fields (when this is a regular chat message)
+	Role    *string  `json:"role,omitempty"`
+	Content *Content `json:"content,omitempty"`
+
+	// WebSearchToolCall fields (when type == "web_search_call")
+	ID     *string `json:"id,omitempty"`
+	Status *string `json:"status,omitempty"`
+}
+
+// IsInputMessage returns true if this is a regular chat message (it carries a role)
+func (m *Message) IsInputMessage() bool {
+	return m.Role != nil
+}
+
+// IsWebSearchCall returns true if this is a web search tool call
+func (m *Message) IsWebSearchCall() bool {
+	return m.Type == "web_search_call"
+}
+
+// ToInputMessage converts to InputMessage, or returns nil when role or content is missing
+func (m *Message) ToInputMessage() *InputMessage {
+	if m.IsInputMessage() && m.Role != nil && m.Content != nil {
+		content := *m.Content
+		return &InputMessage{
+			Role:    *m.Role,
+			Content: content,
+		}
+	}
+	return nil
+}
+
 // RequestBody represents the request body structure for the OpenAI API
 type RequestBody struct {
 	Model string `json:"model"`
-	Input json.RawMessage `json:"input"`
-	InputText string `json:"input_text"`
-	InputMessages []InputMessage `json:"input_messages"`
+	Input Input `json:"input"`
 	Include []string `json:"include,omitempty"`
 	Instructions *string `json:"instructions,omitempty"`
 	MaxOutputTokens *int `json:"max_output_tokens,omitempty"`
@@ -25,91 +121,78 @@ type RequestBody struct {
 	Temperature *float64 `json:"temperature,omitempty"`
 	Text *TextConfig `json:"text,omitempty"`
 	ToolChoice interface{} `json:"tool_choice,omitempty"`
-	Tools []interface{} `json:"tools,omitempty"`
+	Tools []Tool `json:"tools,omitempty"`
 	TopP *float64 `json:"top_p,omitempty"`
 	Truncation *string `json:"truncation,omitempty"`
 }
 
 func (r *RequestBody) SetInputByType() {
-	xlog.Debug("[Parse Request] Set input type", "input", string(r.Input))
-
-	var inputText string
-	if err := json.Unmarshal(r.Input, &inputText); err == nil {
-		r.InputText = inputText
-		return
+	// Input decodes itself during unmarshaling, so this only logs which variant was parsed
+	if r.Input.IsText() {
+		xlog.Debug("[Parse Request] Set input type as text", "input", r.Input.GetText())
+	} else if r.Input.IsMessages() {
+		xlog.Debug("[Parse Request] Input messages parsed", "messages", r.Input.GetMessages())
 	}
-
-	var inputMessages []InputMessage
-	if err := json.Unmarshal(r.Input, &inputMessages); err != nil {
-		xlog.Warn("[Parse Request] Input type not recognized", "input", string(r.Input))
-		return
-	}
-
-	for _, i := range inputMessages {
-		switch content := i.Content.(type) {
-		case []ContentItem:
-			i.ContentItems = content
-		case string:
-			i.ContentText = content
-		default:
-			xlog.Warn("[Parse Request] Input content type not recognized", "content", content)
-		}
-
-		r.InputMessages = append(r.InputMessages, i)
-	}
-
-	xlog.Debug("[Parse Request] Input messages parsed", "messages", r.InputMessages)
 }
 
 func (r *RequestBody) ToChatCompletionMessages() []openai.ChatCompletionMessage {
 	result := []openai.ChatCompletionMessage{}
 
-	for _, m := range r.InputMessages {
-		content := []openai.ChatMessagePart{}
-		oneImageWasFound := false
+	if r.Input.IsMessages() {
+		for _, m := range r.Input.GetMessages() {
+			// Skip entries without a role (web search calls and other types); this also guards *m.Role below
+			if !m.IsInputMessage() {
+				continue
+			}
 
-		if m.ContentText != "" {
-			content = append(content, openai.ChatMessagePart{
-				Type: "text",
-				Text: m.ContentText,
-			})
-		}
+			content := []openai.ChatMessagePart{}
+			oneImageWasFound := false
 
-		for _, c := range m.ContentItems {
-			switch c.Type {
-			case "text":
+			if m.Content != nil && m.Content.IsText() && m.Content.GetText() != "" {
 				content = append(content, openai.ChatMessagePart{
 					Type: "text",
-					Text: c.Text,
-				})
-			case "image":
-				oneImageWasFound = true
-				content = append(content, openai.ChatMessagePart{
-					Type:     "image",
-					ImageURL: &openai.ChatMessageImageURL{URL: c.ImageURL},
+					Text: m.Content.GetText(),
 				})
 			}
-		}
 
-		if oneImageWasFound {
-			result = append(result, openai.ChatCompletionMessage{
-				Role:         m.Role,
-				MultiContent: content,
-			})
-		} else {
-			for _, c := range content {
+			if m.Content != nil && m.Content.IsItems() {
+				for _, c := range m.Content.GetItems() {
+					switch c.Type {
+					case "text":
+						content = append(content, openai.ChatMessagePart{
+							Type: "text",
+							Text: c.Text,
+						})
+					case "image":
+						oneImageWasFound = true
+						content = append(content, openai.ChatMessagePart{
+							Type:     "image",
+							ImageURL: &openai.ChatMessageImageURL{URL: c.ImageURL},
+						})
+					}
+				}
+			}
+
+			if oneImageWasFound {
 				result = append(result, openai.ChatCompletionMessage{
-					Role:    m.Role,
-					Content: c.Text,
+					Role:         *m.Role,
+					MultiContent: content,
 				})
+			} else {
+				for _, c := range content {
+					result = append(result, openai.ChatCompletionMessage{
+						Role:    *m.Role,
+						Content: c.Text,
+					})
+				}
 			}
 		}
 	}
 
-	if r.InputText != "" {
+	if r.Input.IsText() && r.Input.GetText() != "" {
 		result = append(result, openai.ChatCompletionMessage{
 			Role:    "user",
-			Content: r.InputText,
+			Content: r.Input.GetText(),
 		})
 	}
 
@@ -182,7 +265,7 @@ type ResponseBody struct {
 	Temperature float64 `json:"temperature"`
 	Text TextConfig `json:"text"`
 	ToolChoice string `json:"tool_choice"`
-	Tools []interface{} `json:"tools"`
+	Tools []Tool `json:"tools"`
 	TopP float64 `json:"top_p"`
 	Truncation string `json:"truncation"`
 	Usage UsageInfo `json:"usage"`
@@ -190,12 +273,72 @@ type ResponseBody struct {
 	Metadata map[string]interface{} `json:"metadata"`
 }
 
+// Content represents either a string or a slice of ContentItem
+type Content struct {
+	Text  *string        `json:"-"`
+	Items *[]ContentItem `json:"-"`
+}
+
+// UnmarshalJSON decodes a JSON string into Text or a JSON array into Items
+func (c *Content) UnmarshalJSON(data []byte) error {
+	// Try to unmarshal as string first
+	var text string
+	if err := json.Unmarshal(data, &text); err == nil {
+		c.Text = &text
+		return nil
+	}
+
+	// Try to unmarshal as []ContentItem
+	var items []ContentItem
+	if err := json.Unmarshal(data, &items); err == nil {
+		c.Items = &items
+		return nil
+	}
+
+	return json.Unmarshal(data, &struct{}{}) // objects are tolerated (Content stays empty); other JSON kinds yield an error
+}
+
+// MarshalJSON encodes Content as a JSON string, array or null; value receiver so non-pointer values marshal too
+func (c Content) MarshalJSON() ([]byte, error) {
+	if c.Text != nil {
+		return json.Marshal(*c.Text)
+	}
+	if c.Items != nil {
+		return json.Marshal(*c.Items)
+	}
+	return json.Marshal(nil)
+}
+
+// IsText returns true if the content contains text
+func (c *Content) IsText() bool {
+	return c.Text != nil
+}
+
+// IsItems returns true if the content contains items
+func (c *Content) IsItems() bool {
+	return c.Items != nil
+}
+
+// GetText returns the text value or empty string
+func (c *Content) GetText() string {
+	if c.Text != nil {
+		return *c.Text
+	}
+	return ""
+}
+
+// GetItems returns the items value, or nil when no items were set
+func (c *Content) GetItems() []ContentItem {
+	if c.Items != nil {
+		return *c.Items
+	}
+	return nil
+}
+
 // InputMessage represents a user input message
 type InputMessage struct {
-	Role         string        `json:"role"`
-	Content      any           `json:"content"`
-	ContentText  string        `json:"content_text"`
-	ContentItems []ContentItem `json:"content_items"`
+	Role    string  `json:"role"`
+	Content Content `json:"content"`
 }
 
 // ContentItem represents an item in a content array
@@ -204,3 +347,167 @@ type ContentItem struct {
 	Text     string `json:"text,omitempty"`
 	ImageURL string `json:"image_url,omitempty"`
 }
+
+// Tool represents a tool that can be called by the assistant
+type Tool struct {
+	Type string `json:"type"`
+
+	// Function tool fields (used when type == "function")
+	Name        *string     `json:"name,omitempty"`
+	Description *string     `json:"description,omitempty"`
+	Parameters  *JSONSchema `json:"parameters,omitempty"`
+	Strict      *bool       `json:"strict,omitempty"`
+
+	// Web search tool fields (used when type == "web_search_preview" etc.)
+	SearchContextSize *string       `json:"search_context_size,omitempty"`
+	UserLocation      *UserLocation `json:"user_location,omitempty"`
+}
+
+// IsFunction returns true if this is a function tool
+func (t *Tool) IsFunction() bool {
+	return t.Type == "function"
+}
+
+// IsWebSearch returns true if this is a web search tool
+func (t *Tool) IsWebSearch() bool {
+	return t.Type == "web_search_preview" || t.Type == "web_search_preview_2025_03_11"
+}
+
+// ToCompletionFunction converts this tool to a function definition for the completions API.
+// It returns nil for a function tool without a name and for any unrecognized tool type.
+func (t *Tool) ToCompletionFunction() *openai.FunctionDefinition {
+	if t.IsFunction() && t.Name != nil {
+		// Keep params an untyped nil when no schema was supplied
+		var params interface{}
+		if t.Parameters != nil {
+			params = t.Parameters
+		}
+
+		desc := ""
+		if t.Description != nil {
+			desc = *t.Description
+		}
+
+		return &openai.FunctionDefinition{
+			Name:        *t.Name,
+			Description: desc,
+			Parameters:  params,
+		}
+	}
+
+	if t.IsWebSearch() {
+		// Convert web search builtin to function
+		name := "web_search_" + t.Type
+		desc := "Web search tool for finding relevant information online"
+
+		// Create parameters schema for web search options
+		params := map[string]interface{}{
+			"type": "object",
+			"properties": map[string]interface{}{
+				"search_context_size": map[string]interface{}{
+					"type":        "string",
+					"enum":        []string{"low", "medium", "high"},
+					"description": "Amount of context window space to use for search",
+				},
+				"user_location": map[string]interface{}{
+					"type": "object",
+					"properties": map[string]interface{}{
+						"type": map[string]interface{}{
+							"type":        "string",
+							"const":       "approximate",
+							"description": "Type of location approximation",
+						},
+						"city": map[string]interface{}{
+							"type":        "string",
+							"description": "City of the user",
+						},
+						"country": map[string]interface{}{
+							"type":        "string",
+							"description": "Two-letter ISO country code",
+						},
+						"region": map[string]interface{}{
+							"type":        "string",
+							"description": "Region of the user",
+						},
+						"timezone": map[string]interface{}{
+							"type":        "string",
+							"description": "IANA timezone of the user",
+						},
+					},
+				},
+			},
+		}
+
+		return &openai.FunctionDefinition{
+			Name:        name,
+			Description: desc,
+			Parameters:  params,
+		}
+	}
+
+	return nil
+}
+
+// ToCompletionTools converts a slice of Tools to openai.Tool format, dropping tools that cannot be converted
+func ToCompletionTools(tools []Tool) []openai.Tool {
+	result := make([]openai.Tool, 0, len(tools))
+
+	for _, tool := range tools {
+		if fn := tool.ToCompletionFunction(); fn != nil {
+			result = append(result, openai.Tool{
+				Type:     openai.ToolTypeFunction,
+				Function: fn,
+			})
+		}
+	}
+
+	return result
+}
+
+// SeparateTools splits Tools into builtin (web search) and user (function) tools; other types are dropped
+func SeparateTools(tools []Tool) (builtinTools []openai.Tool, userTools []openai.Tool) {
+	for _, tool := range tools {
+		if tool.IsFunction() {
+			// User-defined function tool
+			if fn := tool.ToCompletionFunction(); fn != nil {
+				userTools = append(userTools, openai.Tool{
+					Type:     openai.ToolTypeFunction,
+					Function: fn,
+				})
+			}
+		} else if tool.IsWebSearch() {
+			// Builtin tool (web search)
+			if fn := tool.ToCompletionFunction(); fn != nil {
+				builtinTools = append(builtinTools, openai.Tool{
+					Type:     openai.ToolTypeFunction,
+					Function: fn,
+				})
+			}
+		}
+	}
+	return builtinTools, userTools
+}
+
+// JSONSchema represents a JSON Schema object for function parameters
+type JSONSchema struct {
+	Type                 string                 `json:"type,omitempty"`
+	Properties           map[string]*JSONSchema `json:"properties,omitempty"`
+	Required             []string               `json:"required,omitempty"`
+	Items                *JSONSchema            `json:"items,omitempty"`
+	AdditionalProperties *bool                  `json:"additionalProperties,omitempty"`
+	Description          string                 `json:"description,omitempty"`
+	Enum                 []interface{}          `json:"enum,omitempty"`
+	Format               string                 `json:"format,omitempty"`
+	Minimum              *float64               `json:"minimum,omitempty"`
+	Maximum              *float64               `json:"maximum,omitempty"`
+	MinLength            *int                   `json:"minLength,omitempty"`
+	MaxLength            *int                   `json:"maxLength,omitempty"`
+}
+
+// UserLocation represents the user's location for web search
+type UserLocation struct {
+	Type     string  `json:"type"`
+	City     *string `json:"city,omitempty"`
+	Country  *string `json:"country,omitempty"`
+	Region   *string `json:"region,omitempty"`
+	Timezone *string `json:"timezone,omitempty"`
+}