mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
Merge branch 'release/v2.3'
This commit is contained in:
commit
222537c26b
18 changed files with 1020 additions and 247 deletions
|
|
@ -10,6 +10,8 @@ from prometheus_client import Histogram
|
|||
from ... schema import TextDocument, Chunk, Metadata, Triples
|
||||
from ... base import ChunkingService, ConsumerSpec, ProducerSpec
|
||||
|
||||
RecursiveCharacterTextSplitter = None
|
||||
|
||||
from ... provenance import (
|
||||
chunk_uri as make_chunk_uri, derived_entity_triples,
|
||||
set_graph, GRAPH_SOURCE,
|
||||
|
|
@ -41,8 +43,12 @@ class Processor(ChunkingService):
|
|||
self.default_chunk_size = chunk_size
|
||||
self.default_chunk_overlap = chunk_overlap
|
||||
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
self.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter
|
||||
global RecursiveCharacterTextSplitter
|
||||
if RecursiveCharacterTextSplitter is None:
|
||||
from langchain_text_splitters import (
|
||||
RecursiveCharacterTextSplitter as _cls,
|
||||
)
|
||||
RecursiveCharacterTextSplitter = _cls
|
||||
|
||||
if not hasattr(__class__, "chunk_metric"):
|
||||
__class__.chunk_metric = Histogram(
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@ from prometheus_client import Histogram
|
|||
from ... schema import TextDocument, Chunk, Metadata, Triples
|
||||
from ... base import ChunkingService, ConsumerSpec, ProducerSpec
|
||||
|
||||
TokenTextSplitter = None
|
||||
|
||||
from ... provenance import (
|
||||
chunk_uri as make_chunk_uri, derived_entity_triples,
|
||||
set_graph, GRAPH_SOURCE,
|
||||
|
|
@ -41,8 +43,10 @@ class Processor(ChunkingService):
|
|||
self.default_chunk_size = chunk_size
|
||||
self.default_chunk_overlap = chunk_overlap
|
||||
|
||||
from langchain_text_splitters import TokenTextSplitter
|
||||
self.TokenTextSplitter = TokenTextSplitter
|
||||
global TokenTextSplitter
|
||||
if TokenTextSplitter is None:
|
||||
from langchain_text_splitters import TokenTextSplitter as _cls
|
||||
TokenTextSplitter = _cls
|
||||
|
||||
if not hasattr(__class__, "chunk_metric"):
|
||||
__class__.chunk_metric = Histogram(
|
||||
|
|
|
|||
|
|
@ -124,9 +124,7 @@ class Processor(AsyncProcessor):
|
|||
|
||||
async def start(self):
|
||||
|
||||
await self.pubsub.ensure_queue(
|
||||
self.config_request_topic, self.config_request_subscriber
|
||||
)
|
||||
await self.pubsub.ensure_topic(self.config_request_topic)
|
||||
await self.push() # Startup poke: empty types = everything
|
||||
await self.config_request_consumer.start()
|
||||
|
||||
|
|
|
|||
|
|
@ -119,9 +119,7 @@ class Processor(AsyncProcessor):
|
|||
|
||||
async def start(self):
|
||||
|
||||
await self.pubsub.ensure_queue(
|
||||
self.knowledge_request_topic, self.knowledge_request_subscriber
|
||||
)
|
||||
await self.pubsub.ensure_topic(self.knowledge_request_topic)
|
||||
await super(Processor, self).start()
|
||||
await self.knowledge_request_consumer.start()
|
||||
await self.knowledge_response_producer.start()
|
||||
|
|
|
|||
|
|
@ -15,6 +15,9 @@ from ... schema import Document, TextDocument, Metadata
|
|||
from ... schema import librarian_request_queue, librarian_response_queue
|
||||
from ... schema import Triples
|
||||
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec, LibrarianClient
|
||||
|
||||
PyPDFLoader = None
|
||||
|
||||
from ... provenance import (
|
||||
document_uri, page_uri as make_page_uri, derived_entity_triples,
|
||||
set_graph, GRAPH_SOURCE,
|
||||
|
|
@ -128,7 +131,12 @@ class Processor(FlowProcessor):
|
|||
fp.write(base64.b64decode(v.data))
|
||||
fp.close()
|
||||
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
global PyPDFLoader
|
||||
if PyPDFLoader is None:
|
||||
from langchain_community.document_loaders import (
|
||||
PyPDFLoader as _cls,
|
||||
)
|
||||
PyPDFLoader = _cls
|
||||
loader = PyPDFLoader(temp_path)
|
||||
pages = loader.load()
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import logging
|
|||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Queue deletion retry settings
|
||||
# Topic deletion retry settings
|
||||
DELETE_RETRIES = 5
|
||||
DELETE_RETRY_DELAY = 2 # seconds
|
||||
|
||||
|
|
@ -215,11 +215,11 @@ class FlowConfig:
|
|||
|
||||
return result
|
||||
|
||||
# Pre-create flow-level queues so the data path is wired
|
||||
# Pre-create topic exchanges so the data path is wired
|
||||
# before processors receive their config and start connecting.
|
||||
queues = self._collect_flow_queues(cls, repl_template_with_params)
|
||||
for topic, subscription in queues:
|
||||
await self.pubsub.create_queue(topic, subscription)
|
||||
topics = self._collect_flow_topics(cls, repl_template_with_params)
|
||||
for topic in topics:
|
||||
await self.pubsub.create_topic(topic)
|
||||
|
||||
# Build all processor config updates, then write in a single batch.
|
||||
updates = []
|
||||
|
|
@ -283,8 +283,8 @@ class FlowConfig:
|
|||
error = None,
|
||||
)
|
||||
|
||||
async def ensure_existing_flow_queues(self):
|
||||
"""Ensure queues exist for all already-running flows.
|
||||
async def ensure_existing_flow_topics(self):
|
||||
"""Ensure topics exist for all already-running flows.
|
||||
|
||||
Called on startup to handle flows that were started before this
|
||||
version of the flow service was deployed, or before a restart.
|
||||
|
|
@ -315,7 +315,7 @@ class FlowConfig:
|
|||
if blueprint_data is None:
|
||||
logger.warning(
|
||||
f"Blueprint '{blueprint_name}' not found for "
|
||||
f"flow '{flow_id}', skipping queue creation"
|
||||
f"flow '{flow_id}', skipping topic creation"
|
||||
)
|
||||
continue
|
||||
|
||||
|
|
@ -333,65 +333,63 @@ class FlowConfig:
|
|||
)
|
||||
return result
|
||||
|
||||
queues = self._collect_flow_queues(cls, repl_template)
|
||||
for topic, subscription in queues:
|
||||
await self.pubsub.ensure_queue(topic, subscription)
|
||||
topics = self._collect_flow_topics(cls, repl_template)
|
||||
for topic in topics:
|
||||
await self.pubsub.ensure_topic(topic)
|
||||
|
||||
logger.info(
|
||||
f"Ensured queues for existing flow '{flow_id}'"
|
||||
f"Ensured topics for existing flow '{flow_id}'"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to ensure queues for flow '{flow_id}': {e}"
|
||||
f"Failed to ensure topics for flow '{flow_id}': {e}"
|
||||
)
|
||||
|
||||
def _collect_flow_queues(self, cls, repl_template):
|
||||
"""Collect (topic, subscription) pairs for all flow-level queues.
|
||||
def _collect_flow_topics(self, cls, repl_template):
|
||||
"""Collect unique topic identifiers from the blueprint.
|
||||
|
||||
Iterates the blueprint's "flow" section and reads only the
|
||||
"topics" dict from each processor entry.
|
||||
Iterates the blueprint's "flow" section and returns a
|
||||
deduplicated set of resolved topic strings. The flow service
|
||||
manages topic lifecycle (create/delete exchanges), not
|
||||
individual consumer queues.
|
||||
"""
|
||||
queues = []
|
||||
topics = set()
|
||||
|
||||
for k, v in cls["flow"].items():
|
||||
processor, variant = k.split(":", 1)
|
||||
variant = repl_template(variant)
|
||||
|
||||
for spec_name, topic_template in v.get("topics", {}).items():
|
||||
topic = repl_template(topic_template)
|
||||
subscription = f"{processor}--{variant}--{spec_name}"
|
||||
queues.append((topic, subscription))
|
||||
topics.add(topic)
|
||||
|
||||
return queues
|
||||
return topics
|
||||
|
||||
async def _delete_queues(self, queues):
|
||||
"""Delete queues with retries. Best-effort — logs failures but
|
||||
async def _delete_topics(self, topics):
|
||||
"""Delete topics with retries. Best-effort — logs failures but
|
||||
does not raise."""
|
||||
for attempt in range(DELETE_RETRIES):
|
||||
remaining = []
|
||||
|
||||
for topic, subscription in queues:
|
||||
for topic in topics:
|
||||
try:
|
||||
await self.pubsub.delete_queue(topic, subscription)
|
||||
await self.pubsub.delete_topic(topic)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Queue delete failed (attempt {attempt + 1}/"
|
||||
f"Topic delete failed (attempt {attempt + 1}/"
|
||||
f"{DELETE_RETRIES}): {topic}: {e}"
|
||||
)
|
||||
remaining.append((topic, subscription))
|
||||
remaining.append(topic)
|
||||
|
||||
if not remaining:
|
||||
return
|
||||
|
||||
queues = remaining
|
||||
topics = remaining
|
||||
|
||||
if attempt < DELETE_RETRIES - 1:
|
||||
await asyncio.sleep(DELETE_RETRY_DELAY)
|
||||
|
||||
for topic, subscription in queues:
|
||||
for topic in topics:
|
||||
logger.error(
|
||||
f"Failed to delete queue after {DELETE_RETRIES} "
|
||||
f"Failed to delete topic after {DELETE_RETRIES} "
|
||||
f"attempts: {topic}"
|
||||
)
|
||||
|
||||
|
|
@ -426,8 +424,8 @@ class FlowConfig:
|
|||
result = result.replace(f"{{{param_name}}}", str(param_value))
|
||||
return result
|
||||
|
||||
# Collect queue identifiers before removing config
|
||||
queues = self._collect_flow_queues(cls, repl_template)
|
||||
# Collect topic identifiers before removing config
|
||||
topics = self._collect_flow_topics(cls, repl_template)
|
||||
|
||||
# Phase 1: Set status to "stopping" and remove processor config.
|
||||
# The config push tells processors to shut down their consumers.
|
||||
|
|
@ -448,8 +446,8 @@ class FlowConfig:
|
|||
|
||||
await self.config.delete_many(deletes)
|
||||
|
||||
# Phase 2: Delete queues with retries, then remove the flow record.
|
||||
await self._delete_queues(queues)
|
||||
# Phase 2: Delete topics with retries, then remove the flow record.
|
||||
await self._delete_topics(topics)
|
||||
|
||||
if msg.flow_id in await self.config.keys("flow"):
|
||||
await self.config.delete("flow", msg.flow_id)
|
||||
|
|
|
|||
|
|
@ -101,11 +101,9 @@ class Processor(AsyncProcessor):
|
|||
|
||||
async def start(self):
|
||||
|
||||
await self.pubsub.ensure_queue(
|
||||
self.flow_request_topic, self.flow_request_subscriber
|
||||
)
|
||||
await self.pubsub.ensure_topic(self.flow_request_topic)
|
||||
await self.config_client.start()
|
||||
await self.flow.ensure_existing_flow_queues()
|
||||
await self.flow.ensure_existing_flow_topics()
|
||||
await self.flow_request_consumer.start()
|
||||
|
||||
async def on_flow_request(self, msg, consumer, flow):
|
||||
|
|
|
|||
|
|
@ -263,12 +263,8 @@ class Processor(AsyncProcessor):
|
|||
|
||||
async def start(self):
|
||||
|
||||
await self.pubsub.ensure_queue(
|
||||
self.librarian_request_topic, self.librarian_request_subscriber
|
||||
)
|
||||
await self.pubsub.ensure_queue(
|
||||
self.collection_request_topic, self.collection_request_subscriber
|
||||
)
|
||||
await self.pubsub.ensure_topic(self.librarian_request_topic)
|
||||
await self.pubsub.ensure_topic(self.collection_request_topic)
|
||||
await super(Processor, self).start()
|
||||
await self.librarian_request_consumer.start()
|
||||
await self.librarian_response_producer.start()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue