Merge pull request #226 from trustgraph-ai/feature/doc-load-over-websocket

Doc & text load over websocket
This commit is contained in:
cybermaggedon 2024-12-28 19:59:15 +00:00 committed by GitHub
commit f80424c795
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 358 additions and 112 deletions

106
test-api/test-load-document Executable file
View file

@ -0,0 +1,106 @@
#!/usr/bin/env python3

# Test script: load a PDF document into TrustGraph through the REST
# API, attaching RDF metadata that describes the document, its
# publication event, and the publishing organization.

import requests
import json
import sys
import base64

url = "http://localhost:8088/api/v1/"

############################################################################

# Read the raw PDF bytes.  A context manager is used so the file
# handle is closed deterministically instead of leaking until GC.
with open("../sources/Challenger-Report-Vol1.pdf", "rb") as f:
    text = f.read()

# Some random identifiers. The doc ID is important, as extracted knowledge
# is linked back to this identifier
org_id = "https://trustgraph.ai/org/1dd51ece-8bd3-48b8-98ce-1ac9164c5214"
doc_id = "https://trustgraph.ai/doc/72ef3374-af7a-40c4-8c7b-45050aef5b90"
pub_id = "https://trustgraph.ai/pubev/59012ae1-65d4-441f-8288-b6f3c6c15333"

# Organization metadata
org_facts = [
    [org_id, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
     "https://schema.org/Organization"],
    [org_id, "http://www.w3.org/2000/01/rdf-schema#label", "NASA"],
    [org_id, "https://schema.org/name", "NASA"]
]

# Publication metadata. Note how it links to the Organization
pub_facts = [
    [pub_id, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
     "https://schema.org/PublicationEvent"],
    [pub_id, "https://schema.org/description", "Uploading to Github"],
    [pub_id, "https://schema.org/endDate", "1986-06-06"],
    [pub_id, "https://schema.org/publishedBy", org_id],
    [pub_id, "https://schema.org/startDate", "1986-06-06"]
]

# Document metadata. Note how it links to the publication event
doc_facts = [
    [doc_id, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
     "https://schema.org/DigitalDocument"],
    [doc_id, "http://www.w3.org/2000/01/rdf-schema#label",
     "Challenger Report Volume 1"],
    [doc_id, "https://schema.org/copyrightHolder", "US Government"],
    [doc_id, "https://schema.org/copyrightNotice",
     "Work of the US Gov. Public Use Permitted"],
    [doc_id, "https://schema.org/copyrightYear", "1986"],
    [doc_id, "https://schema.org/description",
     "The findings of the Presidential Commission regarding the circumstances surrounding the Challenger accident are reported and recommendations for corrective action are outlined"
    ],
    [doc_id, "https://schema.org/keywords", "nasa"],
    [doc_id, "https://schema.org/keywords", "challenger"],
    [doc_id, "https://schema.org/keywords", "space-shuttle"],
    [doc_id, "https://schema.org/keywords", "shuttle"],
    [doc_id, "https://schema.org/keywords", "orbiter"],
    [doc_id, "https://schema.org/name", "Challenger Report Volume 1"],
    [doc_id, "https://schema.org/publication", pub_id],
    [doc_id, "https://schema.org/url",
     "https://ntrs.nasa.gov/citations/19860015255"]
]
def to_value(x):
    """Wrap a string in the TrustGraph value form.

    Returns {"v": x, "e": flag} where the entity flag is True for
    http/https URIs (graph entities) and False for plain literals.
    """
    # str.startswith accepts a tuple, which replaces the original pair
    # of separate prefix checks.
    return { "v": x, "e": x.startswith(("http://", "https://")) }
# Convert the above metadata into the right form
metadata = [
    { "s": to_value(t[0]), "p": to_value(t[1]), "o": to_value(t[2]) }
    for t in org_facts + pub_facts + doc_facts
]

# Request payload.  Named `payload` rather than `input` so the Python
# builtin of that name is not shadowed.
payload = {

    # Document identifier. Knowledge derived by TrustGraph is linked to this
    # identifier, so the additional metadata specified above is linked to the
    # derived knowledge and users of the knowledge graph could see
    # information about the source of knowledge
    "id": doc_id,

    # Additional metadata in the form of RDF triples
    "metadata": metadata,

    # Text character set. Default is UTF-8
    "charset": "utf-8",

    # The PDF document, presented as a base64 encoded document.
    "data": base64.b64encode(text).decode("utf-8")

}

resp = requests.post(
    f"{url}load/document",
    json=payload,
)

resp = resp.json()

# The API reports failure as an "error" member in the JSON response.
if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

print(resp)

100
test-api/test-load-text Executable file
View file

@ -0,0 +1,100 @@
#!/usr/bin/env python3

# Test script: load a plain-text document into TrustGraph through the
# REST API, attaching RDF metadata describing the document, its
# publication event, and the publishing organization.

import requests
import json
import sys
import base64

url = "http://localhost:8088/api/v1/"

############################################################################

# Read the raw text bytes.  A context manager is used so the file
# handle is closed deterministically instead of leaking until GC.
with open("docs/README.cats", "rb") as f:
    text = f.read()

# Some random identifiers. The doc ID is important, as extracted knowledge
# is linked back to this identifier
org_id = "https://trustgraph.ai/org/3c35111a-f8ce-54b2-4dd6-c673f8bf0d09"
doc_id = "https://trustgraph.ai/doc/4faa45c1-f91a-a96a-d44f-2e57b9813db8"
pub_id = "https://trustgraph.ai/pubev/a847d950-a281-4099-aaab-c5e35333ff61"

# Organization metadata
org_facts = [
    [org_id, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
     "https://schema.org/Organization"],
    [org_id, "http://www.w3.org/2000/01/rdf-schema#label", "trustgraph.ai"],
    [org_id, "https://schema.org/name", "trustgraph.ai"]
]

# Publication metadata. Note how it links to the Organization
pub_facts = [
    [pub_id, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
     "https://schema.org/PublicationEvent"],
    [pub_id, "https://schema.org/description", "Uploading to Github"],
    [pub_id, "https://schema.org/endDate", "2024-10-23"],
    [pub_id, "https://schema.org/publishedBy", org_id],
    [pub_id, "https://schema.org/startDate", "2024-10-23"]
]

# Document metadata. Note how it links to the publication event
doc_facts = [
    [doc_id, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
     "https://schema.org/DigitalDocument"],
    [doc_id, "http://www.w3.org/2000/01/rdf-schema#label", "Mark's cats"],
    [doc_id, "https://schema.org/copyrightHolder", "trustgraph.ai"],
    [doc_id, "https://schema.org/copyrightNotice", "Public domain"],
    [doc_id, "https://schema.org/copyrightYear", "2024"],
    [doc_id, "https://schema.org/description",
     "This document describes Mark's cats"],
    [doc_id, "https://schema.org/keywords", "animals"],
    [doc_id, "https://schema.org/keywords", "cats"],
    [doc_id, "https://schema.org/keywords", "home-life"],
    [doc_id, "https://schema.org/name", "Mark's cats"],
    [doc_id, "https://schema.org/publication", pub_id],
    [doc_id, "https://schema.org/url", "https://example.com"]
]
def to_value(x):
    """Wrap a string in the TrustGraph value form.

    http/https URIs are marked as entities (e=True); any other string
    is treated as a plain literal (e=False).
    """
    is_uri = x.startswith("https://") or x.startswith("http://")
    return { "v": x, "e": is_uri }
# Convert the above metadata into the right form
metadata = [
    { "s": to_value(t[0]), "p": to_value(t[1]), "o": to_value(t[2]) }
    for t in org_facts + pub_facts + doc_facts
]

# Request payload.  Named `payload` rather than `input` so the Python
# builtin of that name is not shadowed.
payload = {

    # Document identifier. Knowledge derived by TrustGraph is linked to this
    # identifier, so the additional metadata specified above is linked to the
    # derived knowledge and users of the knowledge graph could see
    # information about the source of knowledge
    "id": doc_id,

    # Additional metadata in the form of RDF triples
    "metadata": metadata,

    # Text character set. Default is UTF-8
    "charset": "utf-8",

    # The text document, presented as a base64 encoded string.
    "text": base64.b64encode(text).decode("utf-8")

}

resp = requests.post(
    f"{url}load/text",
    json=payload,
)

resp = resp.json()

# The API reports failure as an "error" member in the JSON response.
if "error" in resp:
    print(f"Error: {resp['error']}")
    sys.exit(1)

print(resp)

View file

@ -0,0 +1,42 @@
import base64

from .. schema import Document, Metadata
from .. schema import document_ingest_queue
from . sender import ServiceSender
from . serialize import to_subgraph
class DocumentLoadSender(ServiceSender):
    """Fire-and-forget sender which queues a document (e.g. a PDF) on
    the document ingest queue."""

    def __init__(self, pulsar_host):
        super(DocumentLoadSender, self).__init__(
            pulsar_host=pulsar_host,
            request_queue=document_ingest_queue,
            request_schema=Document,
        )

    def to_request(self, body):
        """Build a Document message from the decoded JSON request body.

        BUG FIX: the original referenced an undefined name `data`
        everywhere; the parameter is `body`.
        """

        if "metadata" in body:
            metadata = to_subgraph(body["metadata"])
        else:
            metadata = []

        # Doing a base64 decode/encode here to make sure the
        # content is valid base64
        doc = base64.b64decode(body["data"])

        print("Document received")

        return Document(
            metadata=Metadata(
                id=body.get("id"),
                metadata=metadata,
                user=body.get("user", "trustgraph"),
                collection=body.get("collection", "default"),
            ),
            data=base64.b64encode(doc).decode("utf-8")
        )

View file

@ -26,4 +26,3 @@ class EmbeddingsRequestor(ServiceRequestor):
def from_response(self, message):
    """Unpack an embeddings response message into the API reply shape.

    The boolean second element tells the requestor the exchange is
    complete (single-response service).
    """
    vectors = message.vectors
    reply = { "vectors": vectors }
    return reply, True

View file

@ -0,0 +1,49 @@
# Like ServiceRequestor, but just fire-and-forget instead of request/response
import asyncio
from pulsar.schema import JsonSchema
import uuid
import logging
from . publisher import Publisher
logger = logging.getLogger("sender")
logger.setLevel(logging.INFO)
class ServiceSender:
    """Fire-and-forget counterpart of ServiceRequestor: publishes a
    request message to a Pulsar queue and returns immediately without
    waiting for any response."""

    def __init__(
            self,
            pulsar_host,
            request_queue, request_schema,
    ):
        # Publisher wraps the Pulsar producer on the request queue.
        self.pub = Publisher(
            pulsar_host, request_queue,
            schema=JsonSchema(request_schema)
        )

    async def start(self):
        """Start the underlying publisher."""
        self.pub.start()

    def to_request(self, request):
        # Subclasses must convert the decoded JSON body into a queue
        # message of the configured schema.
        raise RuntimeError("Not defined")

    async def process(self, request, responder=None):
        """Convert and publish the request.

        Returns {} on success, or { "error": ... } on failure.  The
        send runs in a worker thread as the publisher API is blocking.
        """
        try:
            await asyncio.to_thread(
                self.pub.send, None, self.to_request(request)
            )
            return {}
        except Exception as e:
            # BUG FIX: was logging.error (root logger); use the module
            # logger configured at the top of this file.
            logger.error(f"Exception: {e}")
            return { "error": str(e) }

View file

@ -24,9 +24,6 @@ from prometheus_client import start_http_server
from .. log_level import LogLevel
from .. schema import Metadata, Document, TextDocument
from .. schema import document_ingest_queue, text_ingest_queue
from . serialize import to_subgraph
from . running import Running
from . publisher import Publisher
@ -46,6 +43,8 @@ from . graph_embeddings_stream import GraphEmbeddingsStreamEndpoint
from . triples_load import TriplesLoadEndpoint
from . graph_embeddings_load import GraphEmbeddingsLoadEndpoint
from . mux import MuxEndpoint
from . document_load import DocumentLoadSender
from . text_load import TextLoadSender
from . endpoint import ServiceEndpoint
from . auth import Authenticator
@ -120,6 +119,12 @@ class Api:
pulsar_host=self.pulsar_host, timeout=self.timeout,
auth = self.auth,
),
"document-load": DocumentLoadSender(
pulsar_host=self.pulsar_host,
),
"text-load": TextLoadSender(
pulsar_host=self.pulsar_host,
),
}
self.endpoints = [
@ -164,6 +169,14 @@ class Api:
endpoint_path = "/api/v1/internet-search", auth=self.auth,
requestor = self.services["internet-search"],
),
ServiceEndpoint(
endpoint_path = "/api/v1/load/document", auth=self.auth,
requestor = self.services["document-load"],
),
ServiceEndpoint(
endpoint_path = "/api/v1/load/text", auth=self.auth,
requestor = self.services["text-load"],
),
TriplesStreamEndpoint(
pulsar_host=self.pulsar_host,
auth = self.auth,
@ -187,122 +200,14 @@ class Api:
),
]
self.document_out = Publisher(
self.pulsar_host, document_ingest_queue,
schema=JsonSchema(Document),
chunking_enabled=True,
)
self.text_out = Publisher(
self.pulsar_host, text_ingest_queue,
schema=JsonSchema(TextDocument),
chunking_enabled=True,
)
for ep in self.endpoints:
ep.add_routes(self.app)
self.app.add_routes([
web.post("/api/v1/load/document", self.load_document),
web.post("/api/v1/load/text", self.load_text),
])
async def load_document(self, request):
    """Handle POST /api/v1/load/document: queue a document for ingest.

    Expects a JSON body with "data" (base64 document), optional
    "metadata" (RDF triples), "id", "user" and "collection".
    Returns {} on success or { "error": ... } on failure.
    """
    try:

        data = await request.json()

        if "metadata" in data:
            metadata = to_subgraph(data["metadata"])
        else:
            metadata = []

        # Doing a base64 decode/encode here to make sure the
        # content is valid base64
        doc = base64.b64decode(data["data"])

        # send() has no useful return value, so don't bind it
        # (the original bound an unused `resp`).
        await asyncio.to_thread(
            self.document_out.send,
            None,
            Document(
                metadata=Metadata(
                    id=data.get("id"),
                    metadata=metadata,
                    user=data.get("user", "trustgraph"),
                    collection=data.get("collection", "default"),
                ),
                data=base64.b64encode(doc).decode("utf-8")
            )
        )

        print("Document loaded.")

        return web.json_response(
            { }
        )

    except Exception as e:
        logging.error(f"Exception: {e}")

        return web.json_response(
            { "error": str(e) }
        )
async def load_text(self, request):
    """Handle POST /api/v1/load/text: queue a text document for ingest.

    Expects a JSON body with "text" (base64 text), optional "charset"
    (defaults to utf-8), "metadata" (RDF triples), "id", "user" and
    "collection".  Returns {} on success or { "error": ... } on failure.
    """
    try:

        data = await request.json()

        if "metadata" in data:
            metadata = to_subgraph(data["metadata"])
        else:
            metadata = []

        # Character set defaults to UTF-8 when the caller omits it.
        if "charset" in data:
            charset = data["charset"]
        else:
            charset = "utf-8"

        # Text is base64 encoded
        text = base64.b64decode(data["text"]).decode(charset)

        # send() has no useful return value, so don't bind it
        # (the original bound an unused `resp`).
        await asyncio.to_thread(
            self.text_out.send,
            None,
            TextDocument(
                metadata=Metadata(
                    id=data.get("id"),
                    metadata=metadata,
                    user=data.get("user", "trustgraph"),
                    collection=data.get("collection", "default"),
                ),
                text=text,
            )
        )

        print("Text document loaded.")

        return web.json_response(
            { }
        )

    except Exception as e:
        logging.error(f"Exception: {e}")

        return web.json_response(
            { "error": str(e) }
        )
# Build the aiohttp application: start every endpoint and the two
# ingest publishers, then hand the app to the runner.
async def app_factory(self):
    # Endpoints are async and must be awaited before serving.
    for ep in self.endpoints:
        await ep.start()
    # Publisher.start() is synchronous.
    self.document_out.start()
    self.text_out.start()
    return self.app
def run(self):

View file

@ -0,0 +1,45 @@
import base64
from .. schema import TextDocument, Metadata
from .. schema import text_ingest_queue
from . sender import ServiceSender
from . serialize import to_subgraph
class TextLoadSender(ServiceSender):
    """Fire-and-forget sender which queues a text document on the text
    ingest queue."""

    def __init__(self, pulsar_host):
        super(TextLoadSender, self).__init__(
            pulsar_host=pulsar_host,
            request_queue=text_ingest_queue,
            request_schema=TextDocument,
        )

    def to_request(self, body):
        """Build a TextDocument message from the decoded JSON body."""

        if "metadata" in body:
            metadata = to_subgraph(body["metadata"])
        else:
            metadata = []

        # Character set defaults to UTF-8 when the caller omits it.
        if "charset" in body:
            charset = body["charset"]
        else:
            charset = "utf-8"

        # Text is base64 encoded
        text = base64.b64decode(body["text"]).decode(charset)

        print("Text document received")

        return TextDocument(
            metadata=Metadata(
                id=body.get("id"),
                # BUG FIX: was `metabody=metadata`, which is not a
                # Metadata field and would raise TypeError.
                metadata=metadata,
                user=body.get("user", "trustgraph"),
                collection=body.get("collection", "default"),
            ),
            text=text,
        )