mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
The id field in pipeline Metadata was being overwritten at each processing stage (document → page → chunk), causing knowledge storage to create separate cores per chunk instead of grouping by document.

Add a root field that:
- Is set by librarian to the original document ID
- Is copied unchanged through PDF decoder, chunkers, and extractors
- Is used by knowledge storage for document_id grouping (with fallback to id)

Changes:
- Add root field to Metadata schema with empty string default
- Set root=document.id in librarian when initiating document processing
- Copy root through PDF decoder, recursive chunker, and all extractors
- Update knowledge storage to use root (or id as fallback) for grouping
- Add root handling to translators and gateway serialization
- Update test mock Metadata class to include root parameter
170 lines
No EOL
6.2 KiB
Python
170 lines
No EOL
6.2 KiB
Python
from typing import Dict, Any, Tuple, Optional
|
|
from ...schema import (
|
|
KnowledgeRequest, KnowledgeResponse, Triples, GraphEmbeddings,
|
|
Metadata, EntityEmbeddings
|
|
)
|
|
from .base import MessageTranslator
|
|
from .primitives import ValueTranslator, SubgraphTranslator
|
|
|
|
|
|
class KnowledgeRequestTranslator(MessageTranslator):
    """Translator for KnowledgeRequest schema objects.

    Converts between the plain-dict form of a knowledge request and the
    ``KnowledgeRequest`` schema object, including the optional ``triples``
    and ``graph-embeddings`` payloads.
    """

    def __init__(self):
        self.value_translator = ValueTranslator()
        self.subgraph_translator = SubgraphTranslator()

    @staticmethod
    def _metadata_to_pulsar(meta: Dict[str, Any]) -> Metadata:
        """Build a ``Metadata`` object from its dict form.

        ``id``, ``user`` and ``collection`` are required (``KeyError`` when
        missing); ``root`` is optional and defaults to an empty string.
        """
        return Metadata(
            id=meta["id"],
            root=meta.get("root", ""),
            user=meta["user"],
            collection=meta["collection"],
        )

    @staticmethod
    def _metadata_from_pulsar(meta: Metadata) -> Dict[str, Any]:
        """Render a ``Metadata`` object as a dict with id/root/user/collection."""
        return {
            "id": meta.id,
            "root": meta.root,
            "user": meta.user,
            "collection": meta.collection,
        }

    @staticmethod
    def _entity_vector(ent: Dict[str, Any]) -> Any:
        """Return the embedding stored on an entity dict.

        ``from_pulsar`` emits the embedding under ``"vector"``, so that key
        is preferred to keep the two directions symmetric. The older
        ``"vectors"`` key is still accepted so existing callers keep
        working; a ``KeyError`` is raised when neither key is present.
        """
        if "vector" in ent:
            return ent["vector"]
        # NOTE(review): the legacy value is passed through unchanged --
        # confirm its shape matches the schema's ``vector`` field.
        return ent["vectors"]

    def to_pulsar(self, data: Dict[str, Any]) -> KnowledgeRequest:
        """Build a ``KnowledgeRequest`` from its dict representation.

        The ``triples`` and ``graph-embeddings`` sections are translated
        only when present in ``data``; otherwise the corresponding request
        fields are ``None``. Top-level scalar fields are read with
        ``dict.get`` and are therefore ``None`` when absent.

        Raises:
            KeyError: if a present section lacks a required key.
        """
        triples = None
        if "triples" in data:
            section = data["triples"]
            triples = Triples(
                metadata=self._metadata_to_pulsar(section["metadata"]),
                triples=self.subgraph_translator.to_pulsar(section["triples"]),
            )

        graph_embeddings = None
        if "graph-embeddings" in data:
            section = data["graph-embeddings"]
            graph_embeddings = GraphEmbeddings(
                metadata=self._metadata_to_pulsar(section["metadata"]),
                entities=[
                    EntityEmbeddings(
                        entity=self.value_translator.to_pulsar(ent["entity"]),
                        # The rest of this module reads ``entity.vector``,
                        # so populate that same field here.
                        vector=self._entity_vector(ent),
                    )
                    for ent in section["entities"]
                ],
            )

        return KnowledgeRequest(
            operation=data.get("operation"),
            user=data.get("user"),
            id=data.get("id"),
            flow=data.get("flow"),
            collection=data.get("collection"),
            triples=triples,
            graph_embeddings=graph_embeddings,
        )

    def from_pulsar(self, obj: KnowledgeRequest) -> Dict[str, Any]:
        """Render a ``KnowledgeRequest`` as a dict.

        Only fields with a truthy value are included in the result.
        """
        result = {}

        if obj.operation:
            result["operation"] = obj.operation
        if obj.user:
            result["user"] = obj.user
        if obj.id:
            result["id"] = obj.id
        if obj.flow:
            result["flow"] = obj.flow
        if obj.collection:
            result["collection"] = obj.collection

        if obj.triples:
            result["triples"] = {
                "metadata": self._metadata_from_pulsar(obj.triples.metadata),
                "triples": self.subgraph_translator.from_pulsar(obj.triples.triples),
            }

        if obj.graph_embeddings:
            result["graph-embeddings"] = {
                "metadata": self._metadata_from_pulsar(
                    obj.graph_embeddings.metadata
                ),
                "entities": [
                    {
                        "vector": entity.vector,
                        "entity": self.value_translator.from_pulsar(entity.entity),
                    }
                    for entity in obj.graph_embeddings.entities
                ],
            }

        return result
|
|
|
|
|
|
class KnowledgeResponseTranslator(MessageTranslator):
    """Translator for KnowledgeResponse schema objects.

    Only the schema-object-to-dict direction is implemented; ``to_pulsar``
    raises ``NotImplementedError``.
    """

    def __init__(self):
        self.value_translator = ValueTranslator()
        self.subgraph_translator = SubgraphTranslator()

    def to_pulsar(self, data: Dict[str, Any]) -> KnowledgeResponse:
        """Not supported for responses; always raises ``NotImplementedError``."""
        raise NotImplementedError("Response translation to Pulsar not typically needed")

    def from_pulsar(self, obj: KnowledgeResponse) -> Dict[str, Any]:
        """Render a ``KnowledgeResponse`` as a dict.

        The first matching case wins, checked in this order: an ``ids``
        list, a ``triples`` payload, a ``graph-embeddings`` payload, an
        end-of-stream marker, and finally an empty dict.
        """
        # Response to list operation
        if obj.ids is not None:
            return {"ids": obj.ids}

        # Streaming triples response
        if obj.triples:
            triples_meta = obj.triples.metadata
            triples_body = {
                "metadata": {
                    "id": triples_meta.id,
                    "root": triples_meta.root,
                    "user": triples_meta.user,
                    "collection": triples_meta.collection,
                },
                "triples": self.subgraph_translator.from_pulsar(obj.triples.triples),
            }
            return {"triples": triples_body}

        # Streaming graph embeddings response
        if obj.graph_embeddings:
            ge_meta = obj.graph_embeddings.metadata
            entity_dicts = [
                {
                    "vector": item.vector,
                    "entity": self.value_translator.from_pulsar(item.entity),
                }
                for item in obj.graph_embeddings.entities
            ]
            ge_body = {
                "metadata": {
                    "id": ge_meta.id,
                    "root": ge_meta.root,
                    "user": ge_meta.user,
                    "collection": ge_meta.collection,
                },
                "entities": entity_dicts,
            }
            return {"graph-embeddings": ge_body}

        # End of stream marker
        if obj.eos is True:
            return {"eos": True}

        # Empty response (successful delete)
        return {}

    def from_response_with_completion(self, obj: KnowledgeResponse) -> Tuple[Dict[str, Any], bool]:
        """Returns (response_dict, is_final)"""
        payload = self.from_pulsar(obj)

        # Decide whether this message ends the exchange.
        if obj.ids is not None:
            # List response
            finished = True
        elif obj.eos is True:
            # End of stream
            finished = True
        else:
            # Empty response: neither a triples nor an embeddings payload
            finished = not (obj.triples or obj.graph_embeddings)

        return payload, finished