Revert "Feature/configure flows (#345)"

This reverts commit a9197d11ee.
This commit is contained in:
Cyber MacGeddon 2025-04-25 19:02:08 +01:00
parent 3adb3cf59c
commit 1822ca395f
125 changed files with 2628 additions and 3751 deletions

View file

@ -6,63 +6,61 @@ Output is chunk plus embedding.
"""
from ... schema import Chunk, ChunkEmbeddings, DocumentEmbeddings
from ... schema import EmbeddingsRequest, EmbeddingsResponse
from ... schema import chunk_ingest_queue
from ... schema import document_embeddings_store_queue
from ... schema import embeddings_request_queue, embeddings_response_queue
from ... clients.embeddings_client import EmbeddingsClient
from ... log_level import LogLevel
from ... base import ConsumerProducer
from ... base import FlowProcessor, RequestResponseSpec, ConsumerSpec
from ... base import ProducerSpec
module = ".".join(__name__.split(".")[1:-1])
default_ident = "document-embeddings"
default_input_queue = chunk_ingest_queue
default_output_queue = document_embeddings_store_queue
default_subscriber = module
class Processor(FlowProcessor):
class Processor(ConsumerProducer):
def __init__(self, **params):
id = params.get("id")
input_queue = params.get("input_queue", default_input_queue)
output_queue = params.get("output_queue", default_output_queue)
subscriber = params.get("subscriber", default_subscriber)
emb_request_queue = params.get(
"embeddings_request_queue", embeddings_request_queue
)
emb_response_queue = params.get(
"embeddings_response_queue", embeddings_response_queue
)
super(Processor, self).__init__(
**params | {
"id": id,
"input_queue": input_queue,
"output_queue": output_queue,
"embeddings_request_queue": emb_request_queue,
"embeddings_response_queue": emb_response_queue,
"subscriber": subscriber,
"input_schema": Chunk,
"output_schema": DocumentEmbeddings,
}
)
self.register_specification(
ConsumerSpec(
name = "input",
schema = Chunk,
handler = self.on_message,
)
self.embeddings = EmbeddingsClient(
pulsar_host=self.pulsar_host,
pulsar_api_key=self.pulsar_api_key,
input_queue=emb_request_queue,
output_queue=emb_response_queue,
subscriber=module + "-emb",
)
self.register_specification(
RequestResponseSpec(
request_name = "embeddings-request",
request_schema = EmbeddingsRequest,
response_name = "embeddings-response",
response_schema = EmbeddingsResponse,
)
)
self.register_specification(
ProducerSpec(
name = "output",
schema = DocumentEmbeddings
)
)
async def on_message(self, msg, consumer, flow):
async def handle(self, msg):
v = msg.value()
print(f"Indexing {v.metadata.id}...", flush=True)
try:
resp = await flow("embeddings-request").request(
EmbeddingsRequest(
text = v.chunk
)
)
vectors = resp.vectors
vectors = self.embeddings.request(v.chunk)
embeds = [
ChunkEmbeddings(
@ -76,7 +74,7 @@ class Processor(FlowProcessor):
chunks=embeds,
)
await flow("output").send(r)
await self.send(r)
except Exception as e:
print("Exception:", e, flush=True)
@ -89,9 +87,24 @@ class Processor(FlowProcessor):
@staticmethod
def add_args(parser):
FlowProcessor.add_args(parser)
ConsumerProducer.add_args(
parser, default_input_queue, default_subscriber,
default_output_queue,
)
parser.add_argument(
'--embeddings-request-queue',
default=embeddings_request_queue,
help=f'Embeddings request queue (default: {embeddings_request_queue})',
)
parser.add_argument(
'--embeddings-response-queue',
default=embeddings_response_queue,
help=f'Embeddings request queue (default: {embeddings_response_queue})',
)
def run():
Processor.launch(default_ident, __doc__)
Processor.launch(module, __doc__)

View file

@ -1,43 +1,81 @@
"""
Embeddings service, applies an embeddings model using fastembed
Embeddings service, applies an embeddings model selected from HuggingFace.
Input is text, output is embeddings vector.
"""
from ... base import EmbeddingsService
from ... schema import EmbeddingsRequest, EmbeddingsResponse
from ... schema import embeddings_request_queue, embeddings_response_queue
from ... log_level import LogLevel
from ... base import ConsumerProducer
from fastembed import TextEmbedding
import os
default_ident = "embeddings"
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = embeddings_request_queue
default_output_queue = embeddings_response_queue
default_subscriber = module
default_model="sentence-transformers/all-MiniLM-L6-v2"
class Processor(EmbeddingsService):
class Processor(ConsumerProducer):
def __init__(self, **params):
input_queue = params.get("input_queue", default_input_queue)
output_queue = params.get("output_queue", default_output_queue)
subscriber = params.get("subscriber", default_subscriber)
model = params.get("model", default_model)
super(Processor, self).__init__(
**params | { "model": model }
**params | {
"input_queue": input_queue,
"output_queue": output_queue,
"subscriber": subscriber,
"input_schema": EmbeddingsRequest,
"output_schema": EmbeddingsResponse,
"model": model,
}
)
print("Get model...", flush=True)
self.embeddings = TextEmbedding(model_name = model)
async def on_embeddings(self, text):
async def handle(self, msg):
v = msg.value()
# Sender-produced ID
id = msg.properties()["id"]
print(f"Handling input {id}...", flush=True)
text = v.text
vecs = self.embeddings.embed([text])
return [
vecs = [
v.tolist()
for v in vecs
]
print("Send response...", flush=True)
r = EmbeddingsResponse(
vectors=list(vecs),
error=None,
)
await self.send(r, properties={"id": id})
print("Done.", flush=True)
@staticmethod
def add_args(parser):
EmbeddingsService.add_args(parser)
ConsumerProducer.add_args(
parser, default_input_queue, default_subscriber,
default_output_queue,
)
parser.add_argument(
'-m', '--model',
@ -47,5 +85,5 @@ class Processor(EmbeddingsService):
def run():
Processor.launch(default_ident, __doc__)
Processor.launch(module, __doc__)

View file

@ -6,48 +6,53 @@ Output is entity plus embedding.
"""
from ... schema import EntityContexts, EntityEmbeddings, GraphEmbeddings
from ... schema import EmbeddingsRequest, EmbeddingsResponse
from ... schema import entity_contexts_ingest_queue
from ... schema import graph_embeddings_store_queue
from ... schema import embeddings_request_queue, embeddings_response_queue
from ... clients.embeddings_client import EmbeddingsClient
from ... log_level import LogLevel
from ... base import ConsumerProducer
from ... base import FlowProcessor, EmbeddingsClientSpec, ConsumerSpec
from ... base import ProducerSpec
module = ".".join(__name__.split(".")[1:-1])
default_ident = "graph-embeddings"
default_input_queue = entity_contexts_ingest_queue
default_output_queue = graph_embeddings_store_queue
default_subscriber = module
class Processor(FlowProcessor):
class Processor(ConsumerProducer):
def __init__(self, **params):
id = params.get("id")
input_queue = params.get("input_queue", default_input_queue)
output_queue = params.get("output_queue", default_output_queue)
subscriber = params.get("subscriber", default_subscriber)
emb_request_queue = params.get(
"embeddings_request_queue", embeddings_request_queue
)
emb_response_queue = params.get(
"embeddings_response_queue", embeddings_response_queue
)
super(Processor, self).__init__(
**params | {
"id": id,
"input_queue": input_queue,
"output_queue": output_queue,
"embeddings_request_queue": emb_request_queue,
"embeddings_response_queue": emb_response_queue,
"subscriber": subscriber,
"input_schema": EntityContexts,
"output_schema": GraphEmbeddings,
}
)
self.register_specification(
ConsumerSpec(
name = "input",
schema = EntityContexts,
handler = self.on_message,
)
self.embeddings = EmbeddingsClient(
pulsar_host=self.pulsar_host,
input_queue=emb_request_queue,
output_queue=emb_response_queue,
subscriber=module + "-emb",
)
self.register_specification(
EmbeddingsClientSpec(
request_name = "embeddings-request",
response_name = "embeddings-response",
)
)
self.register_specification(
ProducerSpec(
name = "output",
schema = GraphEmbeddings
)
)
async def on_message(self, msg, consumer, flow):
async def handle(self, msg):
v = msg.value()
print(f"Indexing {v.metadata.id}...", flush=True)
@ -58,9 +63,7 @@ class Processor(FlowProcessor):
for entity in v.entities:
vectors = await flow("embeddings-request").embed(
text = entity.context
)
vectors = self.embeddings.request(entity.context)
entities.append(
EntityEmbeddings(
@ -74,7 +77,7 @@ class Processor(FlowProcessor):
entities=entities,
)
await flow("output").send(r)
await self.send(r)
except Exception as e:
print("Exception:", e, flush=True)
@ -87,9 +90,24 @@ class Processor(FlowProcessor):
@staticmethod
def add_args(parser):
FlowProcessor.add_args(parser)
ConsumerProducer.add_args(
parser, default_input_queue, default_subscriber,
default_output_queue,
)
parser.add_argument(
'--embeddings-request-queue',
default=embeddings_request_queue,
help=f'Embeddings request queue (default: {embeddings_request_queue})',
)
parser.add_argument(
'--embeddings-response-queue',
default=embeddings_response_queue,
help=f'Embeddings request queue (default: {embeddings_response_queue})',
)
def run():
Processor.launch(default_ident, __doc__)
Processor.launch(module, __doc__)

View file

@ -11,7 +11,7 @@ from ... base import ConsumerProducer
from ollama import Client
import os
module = "embeddings"
module = ".".join(__name__.split(".")[1:-1])
default_input_queue = embeddings_request_queue
default_output_queue = embeddings_response_queue