mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-03 15:01:00 +02:00
Replace singleton LibrarianClient with per-flow instances via the new LibrarianSpec, giving each flow its own librarian tied to the workspace-scoped request/response queues from the blueprint. Move all workspace-scoped services (config, flow, librarian, knowledge) from a single base-queue response producer to per-workspace response producers created alongside the existing per-workspace request consumers. Update the gateway dispatcher and bootstrapper flow client to subscribe to the matching workspace-scoped response queues. Fix WorkspaceInit to register workspaces through the IAM create-workspace API so they appear in __workspaces__ and are visible to the gateway. Simplify the bootstrapper gate to only check config-svc reachability. Updated tests accordingly.
95 lines
2.9 KiB
Python
95 lines
2.9 KiB
Python
"""
Base chunking service that provides parameter specification functionality
for chunk-size and chunk-overlap parameters, and a librarian client for
fetching large document content.
"""
|
|
|
|
import logging
|
|
|
|
from .flow_processor import FlowProcessor
|
|
from .parameter_spec import ParameterSpec
|
|
from .librarian_spec import LibrarianSpec
|
|
|
|
# Module-level logger, named after this module so that log records can be
# filtered per module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ChunkingService(FlowProcessor):
    """Base service for chunking processors with parameter specification support.

    On construction the service registers three specifications with the
    flow processor: ``ParameterSpec`` entries named ``chunk-size`` and
    ``chunk-overlap``, and a ``LibrarianSpec``.  It also offers helpers to
    obtain a document's text and to resolve the effective chunking
    parameters for a flow.
    """

    def __init__(self, **params):
        """Initialise the processor and register its specifications.

        Args:
            **params: Keyword arguments forwarded unchanged to the parent
                constructor.
        """
        # Call parent constructor
        super().__init__(**params)

        # Register parameter specifications for chunk-size and chunk-overlap
        self.register_specification(ParameterSpec(name="chunk-size"))
        self.register_specification(ParameterSpec(name="chunk-overlap"))

        # Register the librarian specification.
        # NOTE(review): presumably this is what exposes ``flow.librarian``
        # used by get_document_text() -- confirm against LibrarianSpec.
        self.register_specification(LibrarianSpec())

        logger.debug("ChunkingService initialized with parameter specifications")

    async def get_document_text(self, doc, flow):
        """
        Get text content from a TextDocument, fetching from librarian if needed.

        When the document carries a ``document_id`` but no inline ``text``,
        the text is fetched through ``flow.librarian.fetch_document_text``.
        Otherwise the inline ``text`` is decoded as UTF-8.

        Args:
            doc: TextDocument with either inline text or document_id
            flow: Flow object with librarian client

        Returns:
            str: The document text content
        """
        if doc.document_id and not doc.text:
            logger.info(f"Fetching document {doc.document_id} from librarian...")
            text = await flow.librarian.fetch_document_text(
                document_id=doc.document_id,
            )
            logger.info(f"Fetched {len(text)} characters from librarian")
            return text

        # NOTE(review): a document with neither document_id nor text also
        # reaches this branch; the outcome then depends on what an unset
        # ``text`` field holds -- confirm against the TextDocument schema.
        return doc.text.decode("utf-8")

    @staticmethod
    def _int_parameter(flow, name, default):
        """Return flow parameter ``name`` as an int, or ``default``.

        ``default`` is returned when the flow yields ``None`` for the
        parameter, or when looking it up or converting it raises; in the
        latter case a warning is logged.
        """
        try:
            raw = flow(name)
            if raw is not None:
                return int(raw)
        except Exception as e:
            # Deliberately broad: a bad or missing parameter must not stop
            # processing, the caller's default is used instead.
            logger.warning(f"Could not parse {name} parameter: {e}")
        return default

    async def chunk_document(self, msg, consumer, flow, default_chunk_size, default_chunk_overlap):
        """
        Extract chunk parameters from flow and return effective values

        Despite its name this method does not split anything; it only
        resolves the ``chunk-size`` and ``chunk-overlap`` flow parameters,
        falling back to the supplied defaults.

        Args:
            msg: The message being processed (not used here)
            consumer: The consumer instance (not used here)
            flow: The flow object containing parameters
            default_chunk_size: Default chunk size if not configured
            default_chunk_overlap: Default chunk overlap if not configured

        Returns:
            tuple: (chunk_size, chunk_overlap) effective values
        """
        chunk_size = self._int_parameter(flow, "chunk-size", default_chunk_size)
        chunk_overlap = self._int_parameter(
            flow, "chunk-overlap", default_chunk_overlap
        )

        return chunk_size, chunk_overlap
|