trustgraph/trustgraph-base/trustgraph/base/chunking_service.py
cybermaggedon 03cc5ac80f
Per-flow librarian clients and per-workspace response queues (#865)
Replace singleton LibrarianClient with per-flow instances via the new
LibrarianSpec, giving each flow its own librarian tied to the
workspace-scoped request/response queues from the blueprint.

Move all workspace-scoped services (config, flow, librarian, knowledge)
from a single base-queue response producer to per-workspace response
producers created alongside the existing per-workspace request
consumers.  Update the gateway dispatcher and bootstrapper flow client
to subscribe to the matching workspace-scoped response queues.

Fix WorkspaceInit to register workspaces through the IAM
create-workspace API so they appear in __workspaces__ and are visible
to the gateway.  Simplify the bootstrapper gate to only check
config-svc reachability.

Updated tests accordingly.
2026-05-06 12:01:01 +01:00

95 lines
2.9 KiB
Python

"""
Base chunking service that provides parameter specification functionality
for chunk-size and chunk-overlap parameters, and librarian client for
fetching large document content.
"""
import logging
from .flow_processor import FlowProcessor
from .parameter_spec import ParameterSpec
from .librarian_spec import LibrarianSpec
# Module logger, named after this module via __name__; ChunkingService
# emits its debug/info/warning messages through it.
logger = logging.getLogger(__name__)
class ChunkingService(FlowProcessor):
    """
    Base service for chunking processors with parameter specification support.

    On construction it registers the ``chunk-size`` and ``chunk-overlap``
    parameter specifications and a librarian specification.  It also
    provides helpers to obtain a document's text (inline or fetched through
    the flow's librarian) and to resolve the effective chunking parameters
    for a flow.
    """

    def __init__(self, **params):
        """
        Initialise the processor and register its specifications.

        Args:
            **params: Keyword arguments forwarded unchanged to the parent
                constructor.
        """

        # Call parent constructor
        super().__init__(**params)

        # Register parameter specifications for chunk-size and chunk-overlap
        self.register_specification(
            ParameterSpec(name="chunk-size")
        )

        self.register_specification(
            ParameterSpec(name="chunk-overlap")
        )

        # Librarian access; get_document_text() reaches it as flow.librarian
        self.register_specification(
            LibrarianSpec()
        )

        logger.debug("ChunkingService initialized with parameter specifications")

    async def get_document_text(self, doc, flow):
        """
        Get text content from a TextDocument, fetching from librarian if needed.

        The librarian is consulted only when the document carries a
        ``document_id`` and has no inline text; otherwise the inline text
        is used.

        Args:
            doc: TextDocument with either inline text or document_id
            flow: Flow object with librarian client

        Returns:
            str: The document text content.  Inline bytes are decoded as
            UTF-8, inline ``str`` is returned unchanged, and a document with
            neither inline text nor a document_id yields an empty string.

        Raises:
            UnicodeDecodeError: If inline bytes are not valid UTF-8.
        """
        document_id = doc.document_id
        inline_text = doc.text

        if document_id and not inline_text:
            logger.info(f"Fetching document {doc.document_id} from librarian...")
            text = await flow.librarian.fetch_document_text(
                document_id=document_id,
            )
            logger.info(f"Fetched {len(text)} characters from librarian")
            return text

        # No inline text and nothing to fetch: treat like empty bytes, which
        # also decode to "", rather than failing on None.decode().
        if inline_text is None:
            return ""

        # Already-decoded text needs no further work.
        if isinstance(inline_text, str):
            return inline_text

        return inline_text.decode("utf-8")

    async def chunk_document(
            self, msg, consumer, flow,
            default_chunk_size, default_chunk_overlap,
    ):
        """
        Extract chunk parameters from flow and return effective values.

        Despite its name, this method performs no chunking; it only resolves
        the ``chunk-size`` and ``chunk-overlap`` parameters.  A parameter that
        is missing (``None``) or cannot be converted to ``int`` falls back to
        the supplied default, and a warning is logged in the latter case.

        Args:
            msg: The message being processed (not used here)
            consumer: The consumer instance (not used here)
            flow: The flow object containing parameters; called as
                ``flow(name)`` to look a parameter up
            default_chunk_size: Default chunk size if not configured
            default_chunk_overlap: Default chunk overlap if not configured

        Returns:
            tuple: (chunk_size, chunk_overlap) effective values
        """
        chunk_size = self._resolve_int_parameter(
            flow, "chunk-size", default_chunk_size,
        )
        chunk_overlap = self._resolve_int_parameter(
            flow, "chunk-overlap", default_chunk_overlap,
        )

        return chunk_size, chunk_overlap

    @staticmethod
    def _resolve_int_parameter(flow, name, default):
        """
        Look up one flow parameter and coerce it to ``int``.

        Args:
            flow: The flow object, called as ``flow(name)``
            name: Parameter name, e.g. ``"chunk-size"``
            default: Value returned when the parameter is ``None`` or the
                lookup/conversion fails

        Returns:
            The parameter as ``int``, or ``default`` unchanged.
        """
        try:
            value = flow(name)
            if value is not None:
                return int(value)
        except Exception as e:
            # Deliberately broad: parameter resolution is best-effort and
            # must never abort processing; fall back to the default.
            logger.warning(f"Could not parse {name} parameter: {e}")

        return default