Fix/document embeddings (#247)

* Update schema for doc embeddings

* Rename embeddings-vectorize to graph-embeddings

* Added document-embeddings processor (broken, needs fixing)

* Added scripts

* Fixed DE queue schema

* Add missing DE process

* Fix doc RAG processing, put graph-rag and doc-rag in appropriate component files.
This commit is contained in:
cybermaggedon 2025-01-04 21:51:28 +00:00 committed by GitHub
parent c633652fd2
commit 6aa212061d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 421 additions and 189 deletions

View file

@@ -131,6 +131,35 @@ class Api:
except:
raise ProtocolException(f"Response not formatted correctly")
def document_rag(self, question):
    """Ask the document-RAG endpoint a question and return its answer.

    Posts ``{"query": question}`` as JSON to ``{self.url}document-rag``
    and returns the ``"response"`` field of the decoded JSON reply.

    Args:
        question: The question; sent as the ``query`` field of the request.

    Returns:
        The value stored under the ``"response"`` key of the reply.

    Raises:
        ProtocolException: If the HTTP status is not 200, the body is not
            valid JSON, or the reply has no ``"response"`` field.
        Whatever ``self.check_error`` raises (defined outside this block).
    """

    # The input consists of a question
    payload = {
        "query": question
    }

    url = f"{self.url}document-rag"

    # Invoke the API, input is passed as JSON
    resp = requests.post(url, json=payload)

    # Should be a 200 status code
    if resp.status_code != 200:
        raise ProtocolException(f"Status code {resp.status_code}")

    try:
        # Parse the response as JSON
        reply = resp.json()
    except ValueError as e:
        # JSON decode failures (and bad text encodings) derive from
        # ValueError; anything else is left to propagate untouched.
        raise ProtocolException("Expected JSON response") from e

    # NOTE(review): this hands check_error the HTTP response object, not
    # the decoded body; check_error is not visible here — confirm which
    # one it expects.
    self.check_error(resp)

    try:
        return reply["response"]
    except (KeyError, TypeError) as e:
        # KeyError: JSON object without "response".
        # TypeError: reply is not a JSON object (list, string, null, ...).
        raise ProtocolException("Response not formatted correctly") from e
def embeddings(self, text):
# The input consists of a text block

View file

@@ -38,8 +38,12 @@ class DocumentEmbeddingsClient(BaseClient):
output_schema=DocumentEmbeddingsResponse,
)
def request(self, vectors, limit=10, timeout=300):
def request(
        self, vectors, user="trustgraph", collection="default",
        limit=10, timeout=300
):
    """Send a document-embeddings query and return the matching documents.

    All arguments are forwarded as keyword arguments to ``self.call``;
    the ``documents`` attribute of whatever it returns is handed back.

    Args:
        vectors: Query vectors, forwarded unchanged.
        user: Forwarded as ``user``; defaults to ``"trustgraph"``.
        collection: Forwarded as ``collection``; defaults to ``"default"``.
        limit: Forwarded as ``limit``; defaults to 10.
        timeout: Forwarded as ``timeout``; defaults to 300.

    Returns:
        The ``documents`` attribute of the object returned by ``self.call``.
    """
    call_args = dict(
        user=user, collection=collection,
        vectors=vectors, limit=limit, timeout=timeout,
    )
    reply = self.call(**call_args)
    return reply.documents

View file

@@ -35,11 +35,28 @@ chunk_ingest_queue = topic('chunk-load')
############################################################################
# Document embeddings are embeddings associated with a chunk
class ChunkEmbeddings(Record):
# Schema record with two fields; field names and order are part of the
# schema, so do not rename or reorder them casually.
# Bytes field — presumably the chunk's content (confirm against producer).
chunk = Bytes()
# Array of arrays of doubles — presumably one inner array per embedding
# vector computed for this chunk (TODO confirm).
vectors = Array(Array(Double()))
# This is a 'batching' mechanism for the above data
class DocumentEmbeddings(Record):
# Schema record that groups several ChunkEmbeddings into one message.
# Metadata record carried alongside the batch (Metadata is declared
# outside this view).
metadata = Metadata()
# The batch itself: an array of ChunkEmbeddings records.
chunks = Array(ChunkEmbeddings())
# Queue name built by topic() (defined outside this view) from the
# 'document-embeddings-store' identifier; presumably carries
# DocumentEmbeddings messages to the store — confirm against its users.
document_embeddings_store_queue = topic('document-embeddings-store')
############################################################################
# Doc embeddings query
class DocumentEmbeddingsRequest(Record):
# Schema record for a document-embeddings query.
# Query vectors: an array of arrays of doubles.
vectors = Array(Array(Double()))
# Integer limit — presumably the maximum number of results to return
# (confirm against the query service).
limit = Integer()
# User and collection names as strings — presumably they scope the query
# to one user's collection (confirm against the query service).
user = String()
collection = String()
class DocumentEmbeddingsResponse(Record):
error = Error()