Better paragraph splitting

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-25 22:28:08 +01:00
parent 54c8bf5f1a
commit fa12dba7c2
6 changed files with 186 additions and 52 deletions
--- a/pkg/xstrings/split.go
+++ b/pkg/xstrings/split.go
@@ -0,0 +1,72 @@
+package xstrings
+
+import (
+	"strings"
+)
+
+// SplitParagraph splits text into chunks of about maxLength bytes, breaking only at
+// whitespace (space, tab, newline, carriage return) and never in the middle of a word.
+// A chunk can exceed maxLength when it keeps a trailing newline or holds a word too long to fit.
+func SplitParagraph(text string, maxLength int) []string {
+	// Nothing to split: a non-positive limit or empty text comes back as a single chunk
+	if maxLength <= 0 || len(text) == 0 {
+		return []string{text}
+	}
+
+	var chunks []string
+	remainingText := text
+
+	for len(remainingText) > 0 {
+		// The rest fits within maxLength: emit it as the final chunk
+		if len(remainingText) <= maxLength {
+			chunks = append(chunks, remainingText)
+			break
+		}
+
+		// Start at byte offset maxLength (in range: the remainder is longer than that)
+		splitIndex := maxLength
+
+		// Walk backward to the closest whitespace byte; index 0 itself is never tested
+		for splitIndex > 0 && !isWhitespace(rune(remainingText[splitIndex])) {
+			splitIndex--
+		}
+
+		// Reaching 0 means no usable break was found at or before maxLength,
+		// so search forward for the first whitespace instead
+		if splitIndex == 0 {
+			splitIndex = maxLength
+			// The word is kept whole, so this chunk will be longer than maxLength
+			for splitIndex < len(remainingText) && !isWhitespace(rune(remainingText[splitIndex])) {
+				splitIndex++
+			}
+
+			// No whitespace in the rest of the text: emit it all as the last chunk
+			if splitIndex == len(remainingText) {
+				chunks = append(chunks, remainingText)
+				break
+			}
+		}
+
+		// The chunk is everything before the whitespace byte at the split point
+		chunk := remainingText[:splitIndex]
+
+		// A newline at the split point is kept on the chunk (this may add one byte past maxLength)
+		if splitIndex < len(remainingText) && remainingText[splitIndex] == '\n' {
+			chunk += string(remainingText[splitIndex])
+			splitIndex++
+		}
+
+		chunks = append(chunks, chunk)
+
+		// Drop the whitespace that follows so the next chunk starts with a non-whitespace byte
+		remainingText = remainingText[splitIndex:]
+		remainingText = strings.TrimLeftFunc(remainingText, isWhitespace)
+	}
+
+	return chunks
+}
+
+// isWhitespace reports whether r is a space, tab, newline, or carriage return.
+func isWhitespace(r rune) bool {
+	return r == ' ' || r == '\t' || r == '\n' || r == '\r'
+}
--- a/pkg/xstrings/split_test.go
+++ b/pkg/xstrings/split_test.go
@@ -0,0 +1,79 @@
+package xstrings_test
+
+import (
+	xtrings "github.com/mudler/LocalAgent/pkg/xstrings"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for SplitParagraph. The expected chunks are written out literally, so they also
+// record where chunks break and which spaces and newlines are kept or dropped.
+var _ = Describe("SplitParagraph", func() {
+	It("should return the text as a single chunk if it's shorter than maxLen", func() {
+		text := "Short text"
+		maxLen := 20
+		result := xtrings.SplitParagraph(text, maxLen)
+		Expect(result).To(Equal([]string{"Short text"}))
+	})
+
+	It("should split the text into chunks of maxLen without truncating words", func() {
+		text := "This is a longer text that needs to be split into chunks."
+		maxLen := 10
+		result := xtrings.SplitParagraph(text, maxLen)
+		Expect(result).To(Equal([]string{"This is a", "longer", "text that", "needs to", "be split", "into", "chunks."}))
+	})
+
+	It("should handle texts with multiple spaces and newlines correctly", func() {
+		text := "This  is\na\ntext  with\n\nmultiple spaces   and\nnewlines."
+		maxLen := 10
+		result := xtrings.SplitParagraph(text, maxLen)
+		// The first chunk is 11 bytes for maxLen 10: the newline at the split point stays with it.
+		Expect(result).To(Equal([]string{"This  is\na\n", "text  with\n", "multiple", "spaces  ", "and\n", "newlines."}))
+	})
+
+	It("should handle a text with a single word longer than maxLen", func() {
+		text := "supercalifragilisticexpialidocious"
+		maxLen := 10
+		result := xtrings.SplitParagraph(text, maxLen)
+		// A word with no whitespace is never cut, even though it exceeds maxLen.
+		Expect(result).To(Equal([]string{"supercalifragilisticexpialidocious"}))
+	})
+
+	It("should handle a text with empty lines", func() {
+		text := "line1\n\nline2"
+		maxLen := 10
+		result := xtrings.SplitParagraph(text, maxLen)
+		Expect(result).To(Equal([]string{"line1\n\n", "line2"}))
+	})
+
+	It("should handle a text with leading and trailing spaces", func() {
+		text := "   leading spaces and trailing spaces   "
+		maxLen := 15
+		result := xtrings.SplitParagraph(text, maxLen)
+		// Leading spaces of the text are kept; the spaces after the last split are dropped.
+		Expect(result).To(Equal([]string{"   leading", "spaces and", "trailing spaces"}))
+	})
+
+	It("should handle a text with only spaces", func() {
+		text := "   "
+		maxLen := 10
+		result := xtrings.SplitParagraph(text, maxLen)
+		Expect(result).To(Equal([]string{"   "}))
+	})
+
+	It("should handle empty string", func() {
+		text := ""
+		maxLen := 10
+		result := xtrings.SplitParagraph(text, maxLen)
+		Expect(result).To(Equal([]string{""}))
+	})
+
+	It("should handle a text with only newlines", func() {
+		text := "\n\n\n"
+		maxLen := 10
+		result := xtrings.SplitParagraph(text, maxLen)
+		Expect(result).To(Equal([]string{"\n\n\n"}))
+	})
+
+	It("should handle a text with special characters", func() {
+		text := "This is a text with special characters !@#$%^&*()"
+		maxLen := 20
+		result := xtrings.SplitParagraph(text, maxLen)
+		Expect(result).To(Equal([]string{"This is a text with", "special characters", "!@#$%^&*()"}))
+	})
+})
--- a/pkg/xstrings/uniq.go
+++ b/pkg/xstrings/uniq.go
@@ -0,0 +1,15 @@
+package xstrings
+
+// Comparable is the constraint used by UniqueSlice: any type whose underlying
+// type is int, int64, or string.
+type Comparable interface{ ~int | ~int64 | ~string }
+
+// UniqueSlice returns a new slice holding the elements of s with duplicates
+// removed, keeping the first occurrence of each value in its original order.
+// The result is never nil: an empty or nil s yields an empty slice.
+func UniqueSlice[T Comparable](s []T) []T {
+	keys := make(map[T]bool) // values already copied into list
+	list := []T{}
+	for _, entry := range s {
+		// value is the map lookup's "present" flag, not the stored bool
+		if _, value := keys[entry]; !value {
+			keys[entry] = true
+			list = append(list, entry)
+		}
+	}
+	return list
+}
--- a/pkg/xstrings/xstrings_suite_test.go
+++ b/pkg/xstrings/xstrings_suite_test.go
@@ -0,0 +1,13 @@
+package xstrings_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestXStrings is the go test entry point that runs this package's Ginkgo specs.
+func TestXStrings(t *testing.T) {
+	RegisterFailHandler(Fail) // route Gomega assertion failures to Ginkgo's Fail
+	RunSpecs(t, "XStrings test suite")
+}
--- a/services/connectors/slack.go
+++ b/services/connectors/slack.go
@@ -7,12 +7,12 @@ import (
 	"io/ioutil"
 	"log"
 	"os"
-	"regexp"
 	"strings"
 	"sync"
 	"time"

 	"github.com/mudler/LocalAgent/pkg/xlog"
+	"github.com/mudler/LocalAgent/pkg/xstrings"
 	"github.com/mudler/LocalAgent/services/actions"
 	"github.com/sashabaranov/go-openai"

@@ -137,23 +137,11 @@ func replaceUserIDsWithNamesInMessage(api *slack.Client, message string) string
 	return message
 }

-func uniqueStringSlice(s []string) []string {
-	keys := make(map[string]bool)
-	list := []string{}
-	for _, entry := range s {
-		if _, value := keys[entry]; !value {
-			keys[entry] = true
-			list = append(list, entry)
-		}
-	}
-	return list
-}
-
 func generateAttachmentsFromJobResponse(j *types.JobResult) (attachments []slack.Attachment) {
 	for _, state := range j.State {
 		// coming from the search action
 		if urls, exists := state.Metadata[actions.MetadataUrls]; exists {
-			for _, url := range uniqueStringSlice(urls.([]string)) {
+			for _, url := range xstrings.UniqueSlice(urls.([]string)) {
 				attachment := slack.Attachment{
 					Title:     "URL",
 					TitleLink: url,
@@ -165,7 +153,7 @@ func generateAttachmentsFromJobResponse(j *types.JobResult) (attachments []slack

 		// coming from the gen image actions
 		if imagesUrls, exists := state.Metadata[actions.MetadataImages]; exists {
-			for _, url := range uniqueStringSlice(imagesUrls.([]string)) {
+			for _, url := range xstrings.UniqueSlice(imagesUrls.([]string)) {
 				attachment := slack.Attachment{
 					Title:     "Image",
 					TitleLink: url,
@@ -312,45 +300,11 @@ func encodeImageFromURL(imageBytes bytes.Buffer) (string, error) {
 	return base64Image, nil
 }

-// SplitText splits a long text into chunks of a specified maximum length without truncating words and preserves special characters.
-func splitText(text string, maxLen int) []string {
-	if len(text) <= maxLen {
-		return []string{text}
-	}
-
-	var chunks []string
-	lines := strings.Split(text, "\n") // Split text by newlines first
-	whitespaceRegex := regexp.MustCompile(`\s+`)
-
-	for _, line := range lines {
-		var chunk string
-		words := whitespaceRegex.Split(line, -1) // Splitting the line into words while preserving whitespace
-
-		for _, word := range words {
-			if len(chunk)+len(word)+1 > maxLen { // +1 for space
-				chunks = append(chunks, chunk)
-				chunk = word
-			} else {
-				if chunk != "" {
-					chunk += " "
-				}
-				chunk += word
-			}
-		}
-
-		if chunk != "" {
-			chunks = append(chunks, chunk)
-		}
-	}
-
-	return chunks
-}
-
 func replyWithPostMessage(finalResponse string, api *slack.Client, ev *slackevents.MessageEvent, postMessageParams slack.PostMessageParameters, res *types.JobResult) {
 	if len(finalResponse) > 4000 {
 		// split response in multiple messages, and update the first

-		messages := splitText(finalResponse, 4000)
+		messages := xstrings.SplitParagraph(finalResponse, 3000)

 		for i, message := range messages {
 			if i == 0 {
@@ -386,7 +340,7 @@ func replyToUpdateMessage(finalResponse string, api *slack.Client, ev *slackeven
 	if len(finalResponse) > 3000 {
 		// split response in multiple messages, and update the first

-		messages := splitText(finalResponse, 3000)
+		messages := xstrings.SplitParagraph(finalResponse, 3000)

 		_, _, _, err := api.UpdateMessage(
 			ev.Channel,
--- a/services/connectors/telegram.go
+++ b/services/connectors/telegram.go
@@ -14,6 +14,7 @@ import (
 	"github.com/mudler/LocalAgent/core/agent"
 	"github.com/mudler/LocalAgent/core/types"
 	"github.com/mudler/LocalAgent/pkg/xlog"
+	"github.com/mudler/LocalAgent/pkg/xstrings"
 	"github.com/mudler/LocalAgent/services/actions"
 	"github.com/sashabaranov/go-openai"
 )
@@ -97,7 +98,7 @@ func (t *Telegram) handleUpdate(ctx context.Context, b *bot.Bot, a *agent.Agent,

 		// coming from the gen image actions
 		if imagesUrls, exists := res.Metadata[actions.MetadataImages]; exists {
-			for _, url := range uniqueStringSlice(imagesUrls.([]string)) {
+			for _, url := range xstrings.UniqueSlice(imagesUrls.([]string)) {
 				b.SendPhoto(ctx, &bot.SendPhotoParams{
 					ChatID: update.Message.Chat.ID,
 					Photo: models.InputFileString{