Object import (#497)

* Object import dispatcher

* Add object import gateway test
This commit is contained in:
cybermaggedon 2025-09-05 14:06:01 +01:00 committed by GitHub
parent f1d08969ec
commit 257a7951a7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 490 additions and 0 deletions

View file

@@ -37,6 +37,7 @@ from . triples_import import TriplesImport
from . graph_embeddings_import import GraphEmbeddingsImport
from . document_embeddings_import import DocumentEmbeddingsImport
from . entity_contexts_import import EntityContextsImport
from . objects_import import ObjectsImport
from . core_export import CoreExport
from . core_import import CoreImport
@@ -82,6 +83,7 @@ import_dispatchers = {
"graph-embeddings": GraphEmbeddingsImport,
"document-embeddings": DocumentEmbeddingsImport,
"entity-contexts": EntityContextsImport,
"objects": ObjectsImport,
}
class DispatcherWrapper:

View file

@@ -0,0 +1,70 @@
import asyncio
import uuid
import logging
from aiohttp import WSMsgType
from ... schema import Metadata
from ... schema import ExtractedObject
from ... base import Publisher
from . serialize import to_subgraph
# Module logger
logger = logging.getLogger(__name__)
class ObjectsImport:
    """Websocket importer that forwards extracted objects to a Pulsar topic.

    Incoming websocket JSON messages are decoded into ExtractedObject
    instances and handed to a Publisher bound to the given queue.
    """

    def __init__(self, ws, running, pulsar_client, queue):
        self.ws = ws
        self.running = running
        # The publisher owns the outbound buffering for the queue topic.
        self.publisher = Publisher(
            pulsar_client, topic=queue, schema=ExtractedObject
        )

    async def start(self):
        """Start the underlying publisher."""
        await self.publisher.start()

    async def destroy(self):
        """Tear down in a fixed order: stop intake, drain, then close."""
        # First, flip the running flag so no new messages are accepted.
        self.running.stop()
        # Next, let the publisher flush whatever it has buffered.
        logger.info("Draining publisher queue...")
        await self.publisher.stop()
        # Only once the queue is drained is the websocket closed.
        if self.ws:
            await self.ws.close()

    async def receive(self, msg):
        """Decode one websocket message and publish it as an ExtractedObject."""
        data = msg.json()
        meta_in = data["metadata"]
        meta = Metadata(
            id=meta_in["id"],
            metadata=to_subgraph(meta_in.get("metadata", [])),
            user=meta_in["user"],
            collection=meta_in["collection"],
        )
        obj = ExtractedObject(
            metadata=meta,
            schema_name=data["schema_name"],
            values=data["values"],
            # Optional fields fall back to the schema defaults.
            confidence=data.get("confidence", 1.0),
            source_span=data.get("source_span", ""),
        )
        await self.publisher.send(None, obj)

    async def run(self):
        """Idle until the running flag clears, then close the websocket."""
        while self.running.get():
            await asyncio.sleep(0.5)
        if self.ws:
            await self.ws.close()
            self.ws = None