Better paragraph splitting
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
72
pkg/xstrings/split.go
Normal file
72
pkg/xstrings/split.go
Normal file
@@ -0,0 +1,72 @@
|
||||
package xstrings
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// SplitParagraph splits text into chunks of at most maxLength bytes, breaking
// only at whitespace (space, tab, newline, carriage return) so that words are
// never cut.
//
// Lengths are measured in bytes, not runes. Because every split character is
// ASCII, a split never lands inside a multi-byte UTF-8 sequence.
//
// A word longer than maxLength is kept whole, so the chunk that contains it
// exceeds maxLength. Whitespace at a split point is removed from the start of
// the next chunk; a newline at the split point is instead kept at the end of
// the current chunk, but only when the chunk still fits within maxLength.
//
// If maxLength <= 0 or text is empty, the result is a single-element slice
// containing text unchanged.
func SplitParagraph(text string, maxLength int) []string {
	if maxLength <= 0 || len(text) == 0 {
		return []string{text}
	}

	var chunks []string
	remainingText := text

	for len(remainingText) > 0 {
		// Whatever is left fits in one chunk: emit it and stop.
		if len(remainingText) <= maxLength {
			chunks = append(chunks, remainingText)
			break
		}

		// len(remainingText) > maxLength here, so index maxLength is valid.
		// Walk backward to the last whitespace at or before maxLength.
		splitIndex := maxLength
		for splitIndex > 0 && !isWhitespace(rune(remainingText[splitIndex])) {
			splitIndex--
		}

		// No usable whitespace within the limit: the current word is longer
		// than maxLength. Keep it whole by scanning forward to the next
		// whitespace (or the end of the text).
		if splitIndex == 0 {
			splitIndex = maxLength
			for splitIndex < len(remainingText) && !isWhitespace(rune(remainingText[splitIndex])) {
				splitIndex++
			}
			if splitIndex == len(remainingText) {
				chunks = append(chunks, remainingText)
				break
			}
		}

		chunk := remainingText[:splitIndex]

		// Keep a newline at the split point with the current chunk, but only
		// when doing so does not push the chunk past maxLength.
		if splitIndex < maxLength && remainingText[splitIndex] == '\n' {
			chunk += "\n"
			splitIndex++
		}

		chunks = append(chunks, chunk)

		// Drop the separator whitespace so the next chunk starts on a word.
		remainingText = strings.TrimLeftFunc(remainingText[splitIndex:], isWhitespace)
	}

	return chunks
}

// isWhitespace reports whether r is one of the characters SplitParagraph
// treats as a split point: space, tab, newline, or carriage return.
func isWhitespace(r rune) bool {
	return r == ' ' || r == '\t' || r == '\n' || r == '\r'
}
|
||||
Reference in New Issue
Block a user