Add ingest command

This commit is contained in:
mudler
2023-08-23 21:55:57 +02:00
parent a823131a2d
commit 42d35dd7a3

View File

@@ -1,6 +1,13 @@
import openai import openai
#from langchain.embeddings import HuggingFaceEmbeddings #from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import LocalAIEmbeddings from langchain.embeddings import LocalAIEmbeddings
from langchain.document_loaders import (
SitemapLoader,
# GitHubIssuesLoader,
# GitLoader,
)
import uuid import uuid
import sys import sys
from queue import Queue from queue import Queue
@@ -30,16 +37,32 @@ FILE_NAME_FORMAT = '%Y_%m_%d_%H_%M_%S'
EMBEDDINGS_MODEL = os.environ.get("EMBEDDINGS_MODEL", "all-MiniLM-L6-v2") EMBEDDINGS_MODEL = os.environ.get("EMBEDDINGS_MODEL", "all-MiniLM-L6-v2")
EMBEDDINGS_API_BASE = os.environ.get("EMBEDDINGS_API_BASE", "http://api:8080") EMBEDDINGS_API_BASE = os.environ.get("EMBEDDINGS_API_BASE", "http://api:8080")
PERSISTENT_DIR = os.environ.get("PERSISTENT_DIR", "/data/") PERSISTENT_DIR = os.environ.get("PERSISTENT_DIR", "/tmp/data/")
DB_DIR = os.environ.get("DB_DIR", "/tmp/data/db")
embeddings = LocalAIEmbeddings(model=EMBEDDINGS_MODEL,openai_api_base=EMBEDDINGS_API_BASE) embeddings = LocalAIEmbeddings(model=EMBEDDINGS_MODEL,openai_api_base=EMBEDDINGS_API_BASE)
chroma_client = Chroma(collection_name="memories", persist_directory="/data/db", embedding_function=embeddings)
loop = None loop = None
channel = None channel = None
def call(thing):
    """Schedule coroutine *thing* on the module-level event loop and block
    until it finishes, returning its result (used to bridge sync agent
    actions into the async Discord client)."""
    future = asyncio.run_coroutine_threadsafe(thing, loop)
    return future.result()
def ingest(a, agent_actions={}, localagi=None):
    """Ingest every page referenced by a sitemap into the vector memory store.

    Parameters:
        a: JSON string with a "url" key pointing at a sitemap
           (raises KeyError/json.JSONDecodeError on malformed input).
        agent_actions, localagi: unused here; present to match the common
            agent-action signature (the shared default dict is never mutated).

    Returns a short status string for the agent.
    """
    q = json.loads(a)
    chunk_size = 500
    chunk_overlap = 50
    logger.info(">>> ingesting: ")
    logger.info(q)
    sitemap_loader = SitemapLoader(web_path=q["url"])
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # Load all sitemap pages, then split into overlapping chunks for embedding.
    documents = sitemap_loader.load()
    texts = text_splitter.split_documents(documents)
    db = Chroma.from_documents(
        texts, embeddings, collection_name="memories", persist_directory=DB_DIR
    )
    db.persist()
    db = None  # drop the handle so the store is released, matching save()
    return "Documents ingested"
def create_image(a, agent_actions={}, localagi=None): def create_image(a, agent_actions={}, localagi=None):
q = json.loads(a) q = json.loads(a)
logger.info(">>> creating image: ") logger.info(">>> creating image: ")
@@ -63,6 +86,8 @@ def download_image(url: str):
full_path = f"{PERSISTENT_DIR}{file_name}" full_path = f"{PERSISTENT_DIR}{file_name}"
urllib.request.urlretrieve(url, full_path) urllib.request.urlretrieve(url, full_path)
return file_name return file_name
### Agent capabilities ### Agent capabilities
### These functions are called by the agent to perform actions ### These functions are called by the agent to perform actions
### ###
def save(memory, agent_actions={}, localagi=None):
    """Persist the "content" field of the JSON string *memory* into the
    on-disk Chroma "memories" collection; returns a confirmation string."""
    payload = json.loads(memory)
    logger.info(">>> saving to memories: ")
    logger.info(payload["content"])
    # Open a fresh client per call so we always write to the store at DB_DIR.
    store = Chroma(collection_name="memories", embedding_function=embeddings, persist_directory=DB_DIR)
    store.add_texts([payload["content"]], [{"id": str(uuid.uuid4())}])
    store.persist()
    store = None  # release the client handle once persisted
    return f"The object was saved permanently to memory."
def search_memory(query, agent_actions={}, localagi=None): def search_memory(query, agent_actions={}, localagi=None):
q = json.loads(query) q = json.loads(query)
chroma_client = Chroma(collection_name="memories",embedding_function=embeddings, persist_directory=DB_DIR)
docs = chroma_client.similarity_search(q["reasoning"]) docs = chroma_client.similarity_search(q["reasoning"])
text_res="Memories found in the database:\n" text_res="Memories found in the database:\n"
for doc in docs: for doc in docs:
text_res+="- "+doc.page_content+"\n" text_res+="- "+doc.page_content+"\n"
chroma_client = None
#if args.postprocess: #if args.postprocess:
# return post_process(text_res) # return post_process(text_res)
#return text_res #return text_res
@@ -178,12 +206,12 @@ def search_duckduckgo(a, agent_actions={}, localagi=None):
### Agent action definitions ### Agent action definitions
agent_actions = { agent_actions = {
"create_image": { "generate_picture": {
"function": create_image, "function": create_image,
"plannable": True, "plannable": True,
"description": 'If the user wants to generate an image, the assistant replies with "create_image", a detailed caption, the width and height of the image to generate.', "description": 'For creating a picture, the assistant replies with "generate_picture" and a detailed caption, enhancing it with as much detail as possible.',
"signature": { "signature": {
"name": "create_image", "name": "generate_picture",
"parameters": { "parameters": {
"type": "object", "type": "object",
"properties": { "properties": {
@@ -240,6 +268,25 @@ agent_actions = {
} }
}, },
}, },
"ingest": {
"function": ingest,
"plannable": True,
"description": 'The assistant replies with the action "ingest" when there is an url to a sitemap to ingest memories from.',
"signature": {
"name": "ingest",
"description": """Save or store informations into memory.""",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "information to save"
},
},
"required": ["url"]
}
},
},
"save_memory": { "save_memory": {
"function": save, "function": save,
"plannable": True, "plannable": True,