Support pdf ingestion

This commit is contained in:
Ettore Di Giacinto
2024-04-11 00:40:46 +02:00
parent cb35f871db
commit d237e17719
6 changed files with 97 additions and 0 deletions

View File

@@ -1,12 +1,15 @@
package main
import (
"bytes"
"fmt"
"net/http"
"os"
. "github.com/mudler/local-agent-framework/agent"
"github.com/donseba/go-htmx"
"github.com/dslipak/pdf"
fiber "github.com/gofiber/fiber/v2"
)
@@ -17,6 +20,54 @@ type (
}
)
// KnowledgeBaseFile returns a fiber handler that ingests an uploaded PDF into
// the knowledge base.
//
// The handler reads the multipart form field "file" and the optional form
// field "chunk_size". The upload is saved under ./uploads, its plain text is
// extracted with readPdf, and the text is passed to StringsToKB in a
// background goroutine. When chunk_size is absent or not positive,
// defaultChunkSize is used. Every failure is returned to fiber as an error
// instead of panicking, so a malformed upload cannot crash the server.
func (a *App) KnowledgeBaseFile(db *InMemoryDatabase) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {
		file, err := c.FormFile("file")
		if err != nil {
			return err
		}

		payload := struct {
			ChunkSize int `form:"chunk_size"`
		}{}
		if err := c.BodyParser(&payload); err != nil {
			return err
		}

		if err := os.MkdirAll("./uploads", os.ModePerm); err != nil {
			return fmt.Errorf("creating uploads directory: %w", err)
		}

		// NOTE(review): file.Filename is supplied by the client; confirm it is
		// already reduced to a base name before it is used to build a path.
		destination := fmt.Sprintf("./uploads/%s", file.Filename)
		if err := c.SaveFile(file, destination); err != nil {
			return err
		}

		fmt.Println("File uploaded to: " + destination)
		fmt.Printf("Payload: %+v\n", payload)

		content, err := readPdf(destination)
		if err != nil {
			// A file that cannot be parsed is a request-level failure, not a
			// broken program invariant: report it rather than panic.
			return fmt.Errorf("reading pdf %q: %w", destination, err)
		}

		fmt.Println("Content is", content)

		chunkSize := defaultChunkSize
		if payload.ChunkSize > 0 {
			chunkSize = payload.ChunkSize
		}

		// Indexing runs in the background; the response below is written
		// without waiting for it to finish.
		go StringsToKB(db, chunkSize, content)

		_, err = c.WriteString(chatDiv("File uploaded", "gray"))
		return err
	}
}
func (a *App) KnowledgeBase(db *InMemoryDatabase) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
payload := struct {
@@ -153,3 +204,17 @@ func (a *App) Chat(pool *AgentPool) func(c *fiber.Ctx) error {
return nil
}
}
// readPdf opens the PDF file at path and returns its plain-text content.
//
// It returns an empty string and a non-nil error if the file cannot be opened
// as a PDF, if plain-text extraction fails, or if reading the extracted text
// fails.
func readPdf(path string) (string, error) {
	r, err := pdf.Open(path)
	if err != nil {
		return "", fmt.Errorf("opening pdf %q: %w", path, err)
	}

	text, err := r.GetPlainText()
	if err != nil {
		return "", fmt.Errorf("extracting text from pdf %q: %w", path, err)
	}

	var buf bytes.Buffer
	// A read failure must not be reported as success with truncated text.
	if _, err := buf.ReadFrom(text); err != nil {
		return "", fmt.Errorf("reading text from pdf %q: %w", path, err)
	}
	return buf.String(), nil
}

View File

@@ -127,6 +127,10 @@ func WebsiteToKB(website string, chunkSize int, db *InMemoryDatabase) {
fmt.Println("Found pages: ", len(content))
fmt.Println("ChunkSize: ", chunkSize)
StringsToKB(db, chunkSize, content...)
}
func StringsToKB(db *InMemoryDatabase, chunkSize int, content ...string) {
for _, c := range content {
chunks := splitParagraphIntoChunks(c, chunkSize)
fmt.Println("chunks: ", len(chunks))

View File

@@ -55,6 +55,7 @@ func RegisterRoutes(webapp *fiber.App, pool *AgentPool, db *InMemoryDatabase, ap
webapp.Post("/create", app.Create(pool))
webapp.Get("/delete/:name", app.Delete(pool))
webapp.Post("/knowledgebase", app.KnowledgeBase(db))
webapp.Post("/knowledgebase/upload", app.KnowledgeBaseFile(db))
webapp.Get("/talk/:name", func(c *fiber.Ctx) error {
return c.Render("chat.html", fiber.Map{

View File

@@ -29,6 +29,30 @@
</button>
</div>
</form>
<form id='form' hx-encoding='multipart/form-data' hx-post='/knowledgebase/upload'>
<div class="mb-6">
<label for="file" class="block text-lg font-medium text-gray-400">File</label>
<input type='file' name='file' id='file' class="mt-1 focus:ring-indigo-500 focus:border-indigo-500 block w-full shadow-sm sm:text-lg border-gray-300 rounded-md bg-gray-700 text-white">
</div>
<div class="mb-6">
<label for="chunk_size" class="block text-lg font-medium text-gray-400">Chunk size</label>
<input type="text" name="chunk_size" id="chunk_size" class="mt-1 focus:ring-indigo-500 focus:border-indigo-500 block w-full shadow-sm sm:text-lg border-gray-300 rounded-md bg-gray-700 text-white" placeholder="380">
</div>
<div class="flex items-center justify-between">
<button type="submit" class="w-full flex justify-center py-2 px-4 border border-transparent rounded-md shadow-sm text-sm font-medium text-white bg-blue-500 hover:bg-blue-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-blue-500">
Upload file
</button>
</div>
<div class="mb-6">
<progress id='progress' value='0' max='100'></progress>
</div>
</form>
<script>
// Mirror the upload progress of #form into the #progress bar as a percentage.
htmx.on('#form', 'htmx:xhr:progress', function(evt) {
  // total is 0 when the length is not computable; skip the update so the
  // bar is never set to NaN by a division by zero.
  if (!evt.detail.lengthComputable || !evt.detail.total) {
    return;
  }
  htmx.find('#progress').setAttribute('value', evt.detail.loaded/evt.detail.total * 100)
});
</script>
</div>
</body>
</html>