Merge branch 'release/v2.3'

This commit is contained in:
Cyber MacGeddon 2026-04-18 12:09:52 +01:00
commit 222537c26b
18 changed files with 1020 additions and 247 deletions

View file

@@ -10,6 +10,8 @@ from prometheus_client import Histogram
from ... schema import TextDocument, Chunk, Metadata, Triples
from ... base import ChunkingService, ConsumerSpec, ProducerSpec
RecursiveCharacterTextSplitter = None
from ... provenance import (
chunk_uri as make_chunk_uri, derived_entity_triples,
set_graph, GRAPH_SOURCE,
@@ -41,8 +43,12 @@ class Processor(ChunkingService):
self.default_chunk_size = chunk_size
self.default_chunk_overlap = chunk_overlap
from langchain_text_splitters import RecursiveCharacterTextSplitter
self.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter
global RecursiveCharacterTextSplitter
if RecursiveCharacterTextSplitter is None:
from langchain_text_splitters import (
RecursiveCharacterTextSplitter as _cls,
)
RecursiveCharacterTextSplitter = _cls
if not hasattr(__class__, "chunk_metric"):
__class__.chunk_metric = Histogram(

View file

@@ -10,6 +10,8 @@ from prometheus_client import Histogram
from ... schema import TextDocument, Chunk, Metadata, Triples
from ... base import ChunkingService, ConsumerSpec, ProducerSpec
TokenTextSplitter = None
from ... provenance import (
chunk_uri as make_chunk_uri, derived_entity_triples,
set_graph, GRAPH_SOURCE,
@@ -41,8 +43,10 @@ class Processor(ChunkingService):
self.default_chunk_size = chunk_size
self.default_chunk_overlap = chunk_overlap
from langchain_text_splitters import TokenTextSplitter
self.TokenTextSplitter = TokenTextSplitter
global TokenTextSplitter
if TokenTextSplitter is None:
from langchain_text_splitters import TokenTextSplitter as _cls
TokenTextSplitter = _cls
if not hasattr(__class__, "chunk_metric"):
__class__.chunk_metric = Histogram(

View file

@@ -124,9 +124,7 @@ class Processor(AsyncProcessor):
async def start(self):
await self.pubsub.ensure_queue(
self.config_request_topic, self.config_request_subscriber
)
await self.pubsub.ensure_topic(self.config_request_topic)
await self.push() # Startup poke: empty types = everything
await self.config_request_consumer.start()

View file

@@ -119,9 +119,7 @@ class Processor(AsyncProcessor):
async def start(self):
await self.pubsub.ensure_queue(
self.knowledge_request_topic, self.knowledge_request_subscriber
)
await self.pubsub.ensure_topic(self.knowledge_request_topic)
await super(Processor, self).start()
await self.knowledge_request_consumer.start()
await self.knowledge_response_producer.start()

View file

@@ -15,6 +15,9 @@ from ... schema import Document, TextDocument, Metadata
from ... schema import librarian_request_queue, librarian_response_queue
from ... schema import Triples
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec, LibrarianClient
PyPDFLoader = None
from ... provenance import (
document_uri, page_uri as make_page_uri, derived_entity_triples,
set_graph, GRAPH_SOURCE,
@@ -128,7 +131,12 @@ class Processor(FlowProcessor):
fp.write(base64.b64decode(v.data))
fp.close()
from langchain_community.document_loaders import PyPDFLoader
global PyPDFLoader
if PyPDFLoader is None:
from langchain_community.document_loaders import (
PyPDFLoader as _cls,
)
PyPDFLoader = _cls
loader = PyPDFLoader(temp_path)
pages = loader.load()

View file

@@ -7,7 +7,7 @@ import logging
# Module logger
logger = logging.getLogger(__name__)
# Queue deletion retry settings
# Topic deletion retry settings
DELETE_RETRIES = 5
DELETE_RETRY_DELAY = 2 # seconds
@@ -215,11 +215,11 @@ class FlowConfig:
return result
# Pre-create flow-level queues so the data path is wired
# Pre-create topic exchanges so the data path is wired
# before processors receive their config and start connecting.
queues = self._collect_flow_queues(cls, repl_template_with_params)
for topic, subscription in queues:
await self.pubsub.create_queue(topic, subscription)
topics = self._collect_flow_topics(cls, repl_template_with_params)
for topic in topics:
await self.pubsub.create_topic(topic)
# Build all processor config updates, then write in a single batch.
updates = []
@@ -283,8 +283,8 @@ class FlowConfig:
error = None,
)
async def ensure_existing_flow_queues(self):
"""Ensure queues exist for all already-running flows.
async def ensure_existing_flow_topics(self):
"""Ensure topics exist for all already-running flows.
Called on startup to handle flows that were started before this
version of the flow service was deployed, or before a restart.
@@ -315,7 +315,7 @@ class FlowConfig:
if blueprint_data is None:
logger.warning(
f"Blueprint '{blueprint_name}' not found for "
f"flow '{flow_id}', skipping queue creation"
f"flow '{flow_id}', skipping topic creation"
)
continue
@@ -333,65 +333,63 @@ class FlowConfig:
)
return result
queues = self._collect_flow_queues(cls, repl_template)
for topic, subscription in queues:
await self.pubsub.ensure_queue(topic, subscription)
topics = self._collect_flow_topics(cls, repl_template)
for topic in topics:
await self.pubsub.ensure_topic(topic)
logger.info(
f"Ensured queues for existing flow '{flow_id}'"
f"Ensured topics for existing flow '{flow_id}'"
)
except Exception as e:
logger.error(
f"Failed to ensure queues for flow '{flow_id}': {e}"
f"Failed to ensure topics for flow '{flow_id}': {e}"
)
def _collect_flow_queues(self, cls, repl_template):
"""Collect (topic, subscription) pairs for all flow-level queues.
def _collect_flow_topics(self, cls, repl_template):
"""Collect unique topic identifiers from the blueprint.
Iterates the blueprint's "flow" section and reads only the
"topics" dict from each processor entry.
Iterates the blueprint's "flow" section and returns a
deduplicated set of resolved topic strings. The flow service
manages topic lifecycle (create/delete exchanges), not
individual consumer queues.
"""
queues = []
topics = set()
for k, v in cls["flow"].items():
processor, variant = k.split(":", 1)
variant = repl_template(variant)
for spec_name, topic_template in v.get("topics", {}).items():
topic = repl_template(topic_template)
subscription = f"{processor}--{variant}--{spec_name}"
queues.append((topic, subscription))
topics.add(topic)
return queues
return topics
async def _delete_queues(self, queues):
"""Delete queues with retries. Best-effort — logs failures but
async def _delete_topics(self, topics):
"""Delete topics with retries. Best-effort — logs failures but
does not raise."""
for attempt in range(DELETE_RETRIES):
remaining = []
for topic, subscription in queues:
for topic in topics:
try:
await self.pubsub.delete_queue(topic, subscription)
await self.pubsub.delete_topic(topic)
except Exception as e:
logger.warning(
f"Queue delete failed (attempt {attempt + 1}/"
f"Topic delete failed (attempt {attempt + 1}/"
f"{DELETE_RETRIES}): {topic}: {e}"
)
remaining.append((topic, subscription))
remaining.append(topic)
if not remaining:
return
queues = remaining
topics = remaining
if attempt < DELETE_RETRIES - 1:
await asyncio.sleep(DELETE_RETRY_DELAY)
for topic, subscription in queues:
for topic in topics:
logger.error(
f"Failed to delete queue after {DELETE_RETRIES} "
f"Failed to delete topic after {DELETE_RETRIES} "
f"attempts: {topic}"
)
@@ -426,8 +424,8 @@ class FlowConfig:
result = result.replace(f"{{{param_name}}}", str(param_value))
return result
# Collect queue identifiers before removing config
queues = self._collect_flow_queues(cls, repl_template)
# Collect topic identifiers before removing config
topics = self._collect_flow_topics(cls, repl_template)
# Phase 1: Set status to "stopping" and remove processor config.
# The config push tells processors to shut down their consumers.
@@ -448,8 +446,8 @@ class FlowConfig:
await self.config.delete_many(deletes)
# Phase 2: Delete queues with retries, then remove the flow record.
await self._delete_queues(queues)
# Phase 2: Delete topics with retries, then remove the flow record.
await self._delete_topics(topics)
if msg.flow_id in await self.config.keys("flow"):
await self.config.delete("flow", msg.flow_id)

View file

@@ -101,11 +101,9 @@ class Processor(AsyncProcessor):
async def start(self):
await self.pubsub.ensure_queue(
self.flow_request_topic, self.flow_request_subscriber
)
await self.pubsub.ensure_topic(self.flow_request_topic)
await self.config_client.start()
await self.flow.ensure_existing_flow_queues()
await self.flow.ensure_existing_flow_topics()
await self.flow_request_consumer.start()
async def on_flow_request(self, msg, consumer, flow):

View file

@@ -263,12 +263,8 @@ class Processor(AsyncProcessor):
async def start(self):
await self.pubsub.ensure_queue(
self.librarian_request_topic, self.librarian_request_subscriber
)
await self.pubsub.ensure_queue(
self.collection_request_topic, self.collection_request_subscriber
)
await self.pubsub.ensure_topic(self.librarian_request_topic)
await self.pubsub.ensure_topic(self.collection_request_topic)
await super(Processor, self).start()
await self.librarian_request_consumer.start()
await self.librarian_response_producer.start()