Better paragraph splitting
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
72
pkg/xstrings/split.go
Normal file
72
pkg/xstrings/split.go
Normal file
@@ -0,0 +1,72 @@
|
||||
package xstrings
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// SplitParagraph splits text into consecutive chunks of about maxLength
// bytes, breaking only at whitespace (space, tab, '\n', '\r') so that words
// are never cut in half.
//
// For each chunk the split point is the last whitespace byte at or before
// offset maxLength. If there is none, the split moves forward to the first
// whitespace after maxLength, so a word longer than maxLength is emitted
// whole and its chunk exceeds the limit. A '\n' sitting exactly on the split
// point stays attached to the end of the chunk (which can make that chunk one
// byte longer than maxLength); otherwise the whitespace byte at the split
// point is dropped, and in both cases any whitespace that follows is dropped
// too. Whitespace at the very start of text is not trimmed and may end up in
// a chunk of its own.
//
// Lengths are measured in bytes, not runes. When maxLength <= 0 or text is
// empty, the result is a single-element slice holding text unchanged.
func SplitParagraph(text string, maxLength int) []string {
	if text == "" || maxLength <= 0 {
		return []string{text}
	}

	var parts []string
	rest := text
	for rest != "" {
		// Whatever is left fits into one chunk: emit it as-is and stop.
		if len(rest) <= maxLength {
			parts = append(parts, rest)
			break
		}

		// Last whitespace in rest[0..maxLength]. An index of 0 is treated
		// like "not found" because cutting there would yield an empty chunk.
		cut := strings.LastIndexFunc(rest[:maxLength+1], isWhitespace)
		if cut <= 0 {
			// No usable break inside the window: extend to the end of the
			// current word instead of splitting it.
			next := strings.IndexFunc(rest[maxLength:], isWhitespace)
			if next < 0 {
				// The remainder is a single unbreakable word.
				parts = append(parts, rest)
				break
			}
			cut = maxLength + next
		}

		// Keep a newline at the split point attached to this chunk.
		end := cut
		if rest[cut] == '\n' {
			end++
		}
		parts = append(parts, rest[:end])

		// The next chunk never starts with whitespace.
		rest = strings.TrimLeftFunc(rest[end:], isWhitespace)
	}

	return parts
}

// isWhitespace reports whether r is one of the characters SplitParagraph
// treats as a break opportunity: space, tab, line feed or carriage return.
func isWhitespace(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r':
		return true
	default:
		return false
	}
}
|
||||
79
pkg/xstrings/split_test.go
Normal file
79
pkg/xstrings/split_test.go
Normal file
@@ -0,0 +1,79 @@
|
||||
package xstrings_test
|
||||
|
||||
import (
|
||||
xtrings "github.com/mudler/LocalAgent/pkg/xstrings"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("SplitParagraph", func() {
|
||||
It("should return the text as a single chunk if it's shorter than maxLen", func() {
|
||||
text := "Short text"
|
||||
maxLen := 20
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{"Short text"}))
|
||||
})
|
||||
|
||||
It("should split the text into chunks of maxLen without truncating words", func() {
|
||||
text := "This is a longer text that needs to be split into chunks."
|
||||
maxLen := 10
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{"This is a", "longer", "text that", "needs to", "be split", "into", "chunks."}))
|
||||
})
|
||||
|
||||
It("should handle texts with multiple spaces and newlines correctly", func() {
|
||||
text := "This is\na\ntext with\n\nmultiple spaces and\nnewlines."
|
||||
maxLen := 10
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{"This is\na\n", "text with\n", "multiple", "spaces ", "and\n", "newlines."}))
|
||||
})
|
||||
|
||||
It("should handle a text with a single word longer than maxLen", func() {
|
||||
text := "supercalifragilisticexpialidocious"
|
||||
maxLen := 10
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{"supercalifragilisticexpialidocious"}))
|
||||
})
|
||||
|
||||
It("should handle a text with empty lines", func() {
|
||||
text := "line1\n\nline2"
|
||||
maxLen := 10
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{"line1\n\n", "line2"}))
|
||||
})
|
||||
|
||||
It("should handle a text with leading and trailing spaces", func() {
|
||||
text := " leading spaces and trailing spaces "
|
||||
maxLen := 15
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{" leading", "spaces and", "trailing spaces"}))
|
||||
})
|
||||
|
||||
It("should handle a text with only spaces", func() {
|
||||
text := " "
|
||||
maxLen := 10
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{" "}))
|
||||
})
|
||||
|
||||
It("should handle empty string", func() {
|
||||
text := ""
|
||||
maxLen := 10
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{""}))
|
||||
})
|
||||
|
||||
It("should handle a text with only newlines", func() {
|
||||
text := "\n\n\n"
|
||||
maxLen := 10
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{"\n\n\n"}))
|
||||
})
|
||||
|
||||
It("should handle a text with special characters", func() {
|
||||
text := "This is a text with special characters !@#$%^&*()"
|
||||
maxLen := 20
|
||||
result := xtrings.SplitParagraph(text, maxLen)
|
||||
Expect(result).To(Equal([]string{"This is a text with", "special characters", "!@#$%^&*()"}))
|
||||
})
|
||||
})
|
||||
15
pkg/xstrings/uniq.go
Normal file
15
pkg/xstrings/uniq.go
Normal file
@@ -0,0 +1,15 @@
|
||||
package xstrings
|
||||
|
||||
// Comparable lists the element types UniqueSlice was originally restricted
// to. UniqueSlice itself now accepts any comparable type; this constraint is
// kept so that existing code referring to it continues to compile.
type Comparable interface{ ~int | ~int64 | ~string }

// UniqueSlice returns the distinct elements of s in order of first
// appearance. The input slice is not modified.
//
// The result is always non-nil: nil or empty input yields an empty slice.
// T may be any type usable as a map key, which includes every type that
// satisfies Comparable.
func UniqueSlice[T comparable](s []T) []T {
	// Sized for the worst case (no duplicates) so neither container regrows.
	seen := make(map[T]struct{}, len(s))
	unique := make([]T, 0, len(s))

	for _, v := range s {
		if _, dup := seen[v]; dup {
			continue
		}
		seen[v] = struct{}{}
		unique = append(unique, v)
	}

	return unique
}
|
||||
13
pkg/xstrings/xstrings_suite_test.go
Normal file
13
pkg/xstrings/xstrings_suite_test.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package xstrings_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// TestXStrings is the `go test` entry point for this package's Ginkgo specs.
func TestXStrings(t *testing.T) {
|
||||
// Hand Ginkgo's Fail to Gomega as its failure handler.
RegisterFailHandler(Fail)
|
||||
// Run the registered specs under the suite name "XStrings test suite".
RunSpecs(t, "XStrings test suite")
|
||||
}
|
||||
@@ -7,12 +7,12 @@ import (
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/mudler/LocalAgent/pkg/xlog"
|
||||
"github.com/mudler/LocalAgent/pkg/xstrings"
|
||||
"github.com/mudler/LocalAgent/services/actions"
|
||||
"github.com/sashabaranov/go-openai"
|
||||
|
||||
@@ -137,23 +137,11 @@ func replaceUserIDsWithNamesInMessage(api *slack.Client, message string) string
|
||||
return message
|
||||
}
|
||||
|
||||
// uniqueStringSlice returns the distinct strings of s in order of first
// appearance. The result is always non-nil, even for nil or empty input.
func uniqueStringSlice(s []string) []string {
	seen := make(map[string]struct{})
	unique := []string{}

	for _, candidate := range s {
		if _, dup := seen[candidate]; dup {
			continue
		}
		seen[candidate] = struct{}{}
		unique = append(unique, candidate)
	}

	return unique
}
|
||||
|
||||
func generateAttachmentsFromJobResponse(j *types.JobResult) (attachments []slack.Attachment) {
|
||||
for _, state := range j.State {
|
||||
// coming from the search action
|
||||
if urls, exists := state.Metadata[actions.MetadataUrls]; exists {
|
||||
for _, url := range uniqueStringSlice(urls.([]string)) {
|
||||
for _, url := range xstrings.UniqueSlice(urls.([]string)) {
|
||||
attachment := slack.Attachment{
|
||||
Title: "URL",
|
||||
TitleLink: url,
|
||||
@@ -165,7 +153,7 @@ func generateAttachmentsFromJobResponse(j *types.JobResult) (attachments []slack
|
||||
|
||||
// coming from the gen image actions
|
||||
if imagesUrls, exists := state.Metadata[actions.MetadataImages]; exists {
|
||||
for _, url := range uniqueStringSlice(imagesUrls.([]string)) {
|
||||
for _, url := range xstrings.UniqueSlice(imagesUrls.([]string)) {
|
||||
attachment := slack.Attachment{
|
||||
Title: "Image",
|
||||
TitleLink: url,
|
||||
@@ -312,45 +300,11 @@ func encodeImageFromURL(imageBytes bytes.Buffer) (string, error) {
|
||||
return base64Image, nil
|
||||
}
|
||||
|
||||
// splitTextWhitespaceRe matches a run of whitespace; it is compiled once
// instead of on every splitText call.
var splitTextWhitespaceRe = regexp.MustCompile(`\s+`)

// splitText splits text into chunks of at most maxLen bytes without cutting
// words.
//
// If text already fits within maxLen it is returned unchanged as the only
// chunk. Otherwise the text is processed line by line: each line is broken
// into whitespace-separated words, which are re-joined with single spaces
// into chunks no longer than maxLen. A chunk never spans two input lines, so
// newlines and the original spacing are not preserved, and blank lines
// produce no chunk. A single word longer than maxLen is emitted whole as its
// own chunk rather than being truncated.
//
// No empty chunks are returned, and chunks carry no leading or trailing
// whitespace.
func splitText(text string, maxLen int) []string {
	if len(text) <= maxLen {
		return []string{text}
	}

	var chunks []string
	for _, line := range strings.Split(text, "\n") {
		var chunk string
		for _, word := range splitTextWhitespaceRe.Split(line, -1) {
			switch {
			case word == "":
				// Leading/trailing whitespace on the line yields empty
				// fields; they carry no content.
			case chunk == "":
				// The first word always starts the chunk, even when it is
				// longer than maxLen: flushing here would emit an empty chunk.
				chunk = word
			case len(chunk)+1+len(word) > maxLen: // +1 for the joining space
				chunks = append(chunks, chunk)
				chunk = word
			default:
				chunk += " " + word
			}
		}

		if chunk != "" {
			chunks = append(chunks, chunk)
		}
	}

	return chunks
}
|
||||
|
||||
func replyWithPostMessage(finalResponse string, api *slack.Client, ev *slackevents.MessageEvent, postMessageParams slack.PostMessageParameters, res *types.JobResult) {
|
||||
if len(finalResponse) > 4000 {
|
||||
// split response in multiple messages, and update the first
|
||||
|
||||
messages := splitText(finalResponse, 4000)
|
||||
messages := xstrings.SplitParagraph(finalResponse, 3000)
|
||||
|
||||
for i, message := range messages {
|
||||
if i == 0 {
|
||||
@@ -386,7 +340,7 @@ func replyToUpdateMessage(finalResponse string, api *slack.Client, ev *slackeven
|
||||
if len(finalResponse) > 3000 {
|
||||
// split response in multiple messages, and update the first
|
||||
|
||||
messages := splitText(finalResponse, 3000)
|
||||
messages := xstrings.SplitParagraph(finalResponse, 3000)
|
||||
|
||||
_, _, _, err := api.UpdateMessage(
|
||||
ev.Channel,
|
||||
|
||||
@@ -14,6 +14,7 @@ import (
|
||||
"github.com/mudler/LocalAgent/core/agent"
|
||||
"github.com/mudler/LocalAgent/core/types"
|
||||
"github.com/mudler/LocalAgent/pkg/xlog"
|
||||
"github.com/mudler/LocalAgent/pkg/xstrings"
|
||||
"github.com/mudler/LocalAgent/services/actions"
|
||||
"github.com/sashabaranov/go-openai"
|
||||
)
|
||||
@@ -97,7 +98,7 @@ func (t *Telegram) handleUpdate(ctx context.Context, b *bot.Bot, a *agent.Agent,
|
||||
|
||||
// coming from the gen image actions
|
||||
if imagesUrls, exists := res.Metadata[actions.MetadataImages]; exists {
|
||||
for _, url := range uniqueStringSlice(imagesUrls.([]string)) {
|
||||
for _, url := range xstrings.UniqueSlice(imagesUrls.([]string)) {
|
||||
b.SendPhoto(ctx, &bot.SendPhotoParams{
|
||||
ChatID: update.Message.Chat.ID,
|
||||
Photo: models.InputFileString{
|
||||
|
||||
Reference in New Issue
Block a user