trustgraph/tests/unit/test_api/test_library_api.py

"""
Tests for the Library API wrapper round-trip behavior.
Covers the get_documents → update_document path and edge cases
from issue #893.
"""

import datetime
import pytest
from unittest.mock import MagicMock, patch

from trustgraph.api.library import Library, to_value, from_value
from trustgraph.api.types import DocumentMetadata, Triple
from trustgraph.knowledge import Uri, Literal


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _make_library(response=None):
    """Create a ``Library`` backed by a mocked API object.

    The mock's ``workspace`` is set to "default" and its ``request`` method
    returns ``response``; a falsy ``response`` is replaced by an empty dict.

    Returns:
        A ``(library, api_mock)`` tuple so tests can both call the library
        and inspect what was sent through the mock.
    """
    mock_api = MagicMock()
    mock_api.workspace = "default"
    mock_api.request.return_value = response if response else {}
    return Library(mock_api), mock_api


def _wire_triple(s_iri, p_iri, o_val):
    return {
        "s": {"t": "i", "i": s_iri},
        "p": {"t": "i", "i": p_iri},
        "o": {"t": "l", "v": o_val},
    }


def _doc_wire(id="doc-1", time=1700000000, title="Test Doc",
              kind="text/plain", comments="", tags=None,
              metadata=None, parent_id="", document_type="source",
              include_title=True):
    doc = {
        "id": id,
        "time": time,
        "kind": kind,
        "comments": comments,
        "metadata": metadata or [],
        "tags": tags or [],
        "parent-id": parent_id,
        "document-type": document_type,
    }
    if include_title:
        doc["title"] = title
    return doc


# ---------------------------------------------------------------------------
# Bug 1: get_documents tolerates missing title
# ---------------------------------------------------------------------------

class TestGetDocumentsMissingTitle:
    """get_documents must cope with wire documents with or without a title."""

    def test_missing_title_defaults_to_empty(self):
        """A wire document lacking "title" yields a result whose title is ""."""
        wire = _doc_wire(include_title=False)
        library, _ = _make_library({"document-metadatas": [wire]})

        documents = library.get_documents()

        assert len(documents) == 1
        assert documents[0].title == ""

    def test_present_title_preserved(self):
        """A title supplied on the wire is carried through unchanged."""
        wire = _doc_wire(title="My Title")
        library, _ = _make_library({"document-metadatas": [wire]})

        documents = library.get_documents()

        assert documents[0].title == "My Title"


# ---------------------------------------------------------------------------
# Bug 2: update_document handles Triple objects (attribute access)
# ---------------------------------------------------------------------------

class TestUpdateDocumentTripleAccess:
    """update_document must serialize the triples carried in the metadata."""

    @staticmethod
    def _document(triples, tags):
        """Build the "doc-1" DocumentMetadata with the given triples and tags."""
        return DocumentMetadata(
            id="doc-1",
            time=datetime.datetime.fromtimestamp(1700000000),
            kind="text/plain",
            title="Test",
            comments="",
            metadata=triples,
            tags=tags,
        )

    def test_triple_objects_serialized_correctly(self):
        """A Triple object ends up as s/p/o wire terms in the request."""
        library, mock_api = _make_library({})
        alice_knows_bob = Triple(
            s=Uri("http://example.org/entity/alice"),
            p=Uri("http://example.org/rel/knows"),
            o=Literal("Bob"),
        )
        document = self._document([alice_knows_bob], ["test"])

        library.update_document(id="doc-1", metadata=document)

        # Second positional argument of the mocked request is the payload.
        payload = mock_api.request.call_args[0][1]
        sent = payload["document-metadata"]["metadata"]

        assert len(sent) == 1
        first = sent[0]
        assert first["s"]["i"] == "http://example.org/entity/alice"
        assert first["p"]["i"] == "http://example.org/rel/knows"
        assert first["o"]["v"] == "Bob"

    def test_empty_metadata_list(self):
        """An empty triple list is sent as an empty list."""
        library, mock_api = _make_library({})
        document = self._document([], [])

        library.update_document(id="doc-1", metadata=document)

        payload = mock_api.request.call_args[0][1]
        assert payload["document-metadata"]["metadata"] == []


# ---------------------------------------------------------------------------
# Bug 3: update_document serializes datetime to int seconds
# ---------------------------------------------------------------------------

class TestUpdateDocumentTimeSerialization:
    """update_document must put an integer time value on the wire."""

    @staticmethod
    def _wire_time_for(time_value):
        """Run update_document with ``time_value`` and return the time sent.

        The value is read back from the payload handed to the mocked
        ``api.request`` (its second positional argument).
        """
        library, mock_api = _make_library({})
        document = DocumentMetadata(
            id="doc-1",
            time=time_value,
            kind="text/plain",
            title="Test",
            comments="",
            metadata=[],
            tags=[],
        )

        library.update_document(id="doc-1", metadata=document)

        payload = mock_api.request.call_args[0][1]
        return payload["document-metadata"]["time"]

    def test_datetime_serialized_to_int(self):
        """A datetime is sent as the int timestamp it was built from."""
        epoch_seconds = 1700000000
        sent = self._wire_time_for(
            datetime.datetime.fromtimestamp(epoch_seconds)
        )

        assert isinstance(sent, int)
        assert sent == epoch_seconds

    def test_int_time_passed_through(self):
        """A time that is already an int reaches the wire with the same value."""
        sent = self._wire_time_for(1700000000)

        assert sent == 1700000000


# ---------------------------------------------------------------------------
# Bug 4: update_document handles empty server response
# ---------------------------------------------------------------------------

class TestUpdateDocumentEmptyResponse:
    """update_document result handling for empty and populated responses."""

    @staticmethod
    def _client_document(title, comments):
        """Build the client-side "doc-1" metadata, tagged ["a"]."""
        return DocumentMetadata(
            id="doc-1",
            time=datetime.datetime.fromtimestamp(1700000000),
            kind="text/plain",
            title=title,
            comments=comments,
            metadata=[],
            tags=["a"],
        )

    def test_empty_response_returns_input_metadata(self):
        """With an empty response the very same input object is returned."""
        library, _ = _make_library({})
        document = self._client_document("Updated Title", "notes")

        outcome = library.update_document(id="doc-1", metadata=document)

        assert outcome is document

    def test_full_response_parsed(self):
        """When the response carries a document, its values are returned."""
        server_doc = _doc_wire(
            id="doc-1", title="Server Title", tags=["b"],
        )
        library, _ = _make_library({"document-metadata": server_doc})
        document = self._client_document("Client Title", "")

        outcome = library.update_document(id="doc-1", metadata=document)

        assert outcome.title == "Server Title"
        assert outcome.tags == ["b"]


# ---------------------------------------------------------------------------
# Bug 5: update_document sends both id and document-id
# ---------------------------------------------------------------------------

class TestUpdateDocumentIdKeys:
    """update_document must send the id under both "id" and "document-id"."""

    def test_both_id_keys_sent(self):
        """Both id keys in the sent document metadata equal the document id."""
        library, mock_api = _make_library({})
        created = datetime.datetime.fromtimestamp(1700000000)
        document = DocumentMetadata(
            id="doc-1",
            time=created,
            kind="text/plain",
            title="Test",
            comments="",
            metadata=[],
            tags=[],
        )

        library.update_document(id="doc-1", metadata=document)

        # Second positional argument of the mocked request is the payload.
        sent = mock_api.request.call_args[0][1]["document-metadata"]

        assert sent["id"] == "doc-1"
        assert sent["document-id"] == "doc-1"


# ---------------------------------------------------------------------------
# Round-trip: get_documents → update_document
# ---------------------------------------------------------------------------

class TestGetUpdateRoundTrip:
    """End-to-end check: fetch a document, edit it, send it back."""

    def test_full_round_trip(self):
        """A document from get_documents can be modified and updated."""
        type_triple = _wire_triple(
            "http://example.org/e/1",
            "http://example.org/r/type",
            "report",
        )
        listed = _doc_wire(
            id="doc-42",
            title="Original",
            tags=["v1"],
            metadata=[type_triple],
        )
        library, mock_api = _make_library({"document-metadatas": [listed]})

        fetched = library.get_documents()
        assert len(fetched) == 1

        # Edit the fetched object in place before sending it back.
        document = fetched[0]
        document.title = "Updated"
        document.tags.append("v2")

        # Server returns empty on update
        mock_api.request.return_value = {}
        outcome = library.update_document(id=document.id, metadata=document)

        # Should not raise, should return the input metadata
        assert outcome.title == "Updated"
        assert "v2" in outcome.tags

        # Verify the wire format sent (payload of the most recent request)
        sent = mock_api.request.call_args[0][1]["document-metadata"]

        assert sent["id"] == "doc-42"
        assert sent["title"] == "Updated"
        assert isinstance(sent["time"], int)
        assert len(sent["metadata"]) == 1
        assert sent["metadata"][0]["o"]["v"] == "report"
release/v2.4 -> master (#932) * CLI auth migration, document embeddings core lifecycle (#913) Migrate get_kg_core and put_kg_core CLI tools to use Api/SocketClient with first-frame auth (fixes broken raw websocket path). Fix wire format field names (root/vector). Remove ~600 lines of dead raw websocket code from invoke_graph_rag.py. Add document embeddings core lifecycle to the knowledge service: list/get/put/delete/load operations across schema, translator, Cassandra table store, knowledge manager, gateway registry, REST API, socket client, and CLI (tg-get-de-core, tg-put-de-core). Fix delete_kg_core to also clean up document embeddings rows. * Remove spurious workspace parameter from SPARQL algebra evaluator (#915) Fix threading of workspace parameter: - The SPARQL algebra evaluator was threading a workspace parameter through every function and passing it to TriplesClient.query(), which doesn't accept it. Workspace isolation is handled by pub/sub topic routing — the TriplesClient is already scoped to a workspace-specific flow, same as GraphRAG. Passing workspace explicitly was both incorrect and unnecessary. Update tests: - tests/unit/test_query/test_sparql_algebra.py (new) — Tests _query_pattern, _eval_bgp, and evaluate() with various algebra nodes. Key tests assert workspace is never in tc.query() kwargs, plus correctness tests for BGP, JOIN, UNION, SLICE, DISTINCT, and edge cases. - tests/unit/test_retrieval/test_graph_rag.py — Added test_triples_query_never_passes_workspace (checks query()) and test_follow_edges_never_passes_workspace (checks query_stream()). * Make all Cassandra and Qdrant I/O async-safe with proper concurrency controls (#916) Cassandra triples services were using synchronous EntityCentricKnowledgeGraph methods from async contexts, and connection state was managed with threading.local which is wrong for asyncio coroutines sharing a single thread. Qdrant services had no async wrapping at all, blocking the event loop on every network call. 
Rows services had unprotected shared state mutations across concurrent coroutines. - Add async methods to EntityCentricKnowledgeGraph (async_insert, async_get_s/p/o/sp/po/os/spo/all, async_collection_exists, async_create_collection, async_delete_collection) using the existing cassandra_async.async_execute bridge - Rewrite triples write + query services: replace threading.local with asyncio.Lock + dict cache for per-workspace connections, use async ECKG methods for all data operations, keep asyncio.to_thread only for one-time blocking ECKG construction - Wrap all Qdrant calls in asyncio.to_thread across all 6 services (doc/graph/row embeddings write + query), add asyncio.Lock + set cache for collection existence checks - Add asyncio.Lock to rows write + query services to protect shared state (schemas, sessions, config caches) from concurrent mutation - Update all affected tests to match new async patterns * Fixed error only returning a page of results (#921) The root cause: async_execute only materialises the first result page (by design — it says so in its docstring). The streaming query set fetch_size=20 and expected to iterate all results, but only got the first 20 rows back. The fix uses asyncio.to_thread(lambda: list(tg.session.execute(...))) which lets the sync driver iterate all pages in a worker thread — exactly what the pre-async code did. 
* Optional test warning suppression (#923) * Fix test collection module errors & silence upstream Pytest warnings (#823) * chore: add virtual environment and .env directories to gitignore * test: filter upstream DeprecationWarning and UserWarning messages * fix(namespace): remove empty __init__.py files to fix PEP 420 implicit namespace routing for trustgraph sub-packages * Revert __init__.py deletions * Add .ini changes but commented out, will be useful at times --------- Co-authored-by: Salil M <d2kyt@protonmail.com> * fix(openai): fail fast on unrecoverable RateLimitError codes (#901) (#904) (#925) Co-authored-by: Sahil Yadav <sahilyadav.sy2004@gmail.com> * Ensure retry exception is properly raised (#926) * fix: library API get/update document round-trip bugs (#893) (#928) Fix 5 cascading bugs in the Library API wrapper that prevented the get_documents → update_document round-trip from working: - Tolerate missing title field in document metadata (use .get()) - Use attribute access on Triple objects instead of subscript - Serialize datetime to int seconds for JSON compatibility - Handle empty server response on successful update - Send both id and document-id keys in update request Added library API tests * Fix ontology selector defaults, add bypass mode, enforce domain/range (#929) - Align similarity_threshold default to 0.3 everywhere (class signature had stale 0.7). Fix matching contradiction in tech-spec. - Add bypass_selector_below parameter (default 5) to skip vector similarity selection when ontology element count is small enough. - Enforce domain/range constraints in TripleConverter for object properties and datatype properties, with subclass hierarchy support. Properties with no declared domain/range pass through unchanged. - Add unit tests for domain/range validation, subclass acceptance, polymorphic pass-through, and selector bypass. 
Fixes #908, #920 * Close producers on flow stop to prevent stale non-persistent topics (#930) Flow.stop() only stopped consumers, leaving response producers connected to non-persistent Pulsar topics. After flow restart, the orphaned producers held stale broker routing state, causing response messages to never reach new consumers — manifesting as 120s timeouts on document-embeddings and similar RPC paths. Fix: Flow.stop() now explicitly stops all producers. Producer.stop() closes the underlying Pulsar producer connection rather than just setting a flag. Fixes #906 * fix(gateway): propagate --timeout flag to per-service dispatchers (#931) The api-gateway accepts a --timeout flag (default 600s) but the value was not propagated into DispatcherManager, which hard-coded timeout=120 for every per-service dispatcher (graph-rag, document-rag, text-completion, embeddings, librarian, etc.). This meant any synchronous request taking more than 120 seconds would always return a Timeout error at the 120s mark, regardless of the --timeout value set on the gateway. Changes: - Add timeout parameter to DispatcherManager.__init__ (default: 120 for backward compatibility) - Store self.timeout in DispatcherManager - Replace both hardcoded timeout=120 with self.timeout in invoke_global_service and invoke_flow_service - Pass self.timeout from Api to DispatcherManager in service.py - Document the timeout parameter in the docstring Fixes #894 --------- Co-authored-by: Salil M <d2kyt@protonmail.com> Co-authored-by: Sahil Yadav <sahilyadav.sy2004@gmail.com> Co-authored-by: Mister Lobster <jlaportebot@gmail.com> 2026-05-18 09:46:58 +01:00			`"""`
			`Tests for the Library API wrapper round-trip behavior.`
			`Covers the get_documents → update_document path and edge cases`
			`from issue #893.`
			`"""`

			`import datetime`
			`import pytest`
			`from unittest.mock import MagicMock, patch`

			`from trustgraph.api.library import Library, to_value, from_value`
			`from trustgraph.api.types import DocumentMetadata, Triple`
			`from trustgraph.knowledge import Uri, Literal`


			`# ---------------------------------------------------------------------------`
			`# Helpers`
			`# ---------------------------------------------------------------------------`

			`def _make_library(response=None):`
			`api = MagicMock()`
			`api.workspace = "default"`
			`api.request.return_value = response or {}`
			`lib = Library(api)`
			`return lib, api`


			`def _wire_triple(s_iri, p_iri, o_val):`
			`return {`
			`"s": {"t": "i", "i": s_iri},`
			`"p": {"t": "i", "i": p_iri},`
			`"o": {"t": "l", "v": o_val},`
			`}`


			`def _doc_wire(id="doc-1", time=1700000000, title="Test Doc",`
			`kind="text/plain", comments="", tags=None,`
			`metadata=None, parent_id="", document_type="source",`
			`include_title=True):`
			`doc = {`
			`"id": id,`
			`"time": time,`
			`"kind": kind,`
			`"comments": comments,`
			`"metadata": metadata or [],`
			`"tags": tags or [],`
			`"parent-id": parent_id,`
			`"document-type": document_type,`
			`}`
			`if include_title:`
			`doc["title"] = title`
			`return doc`


			`# ---------------------------------------------------------------------------`
			`# Bug 1: get_documents tolerates missing title`
			`# ---------------------------------------------------------------------------`

			`class TestGetDocumentsMissingTitle:`

			`def test_missing_title_defaults_to_empty(self):`
			`doc = _doc_wire(include_title=False)`
			`lib, api = _make_library({"document-metadatas": [doc]})`

			`result = lib.get_documents()`

			`assert len(result) == 1`
			`assert result[0].title == ""`

			`def test_present_title_preserved(self):`
			`doc = _doc_wire(title="My Title")`
			`lib, api = _make_library({"document-metadatas": [doc]})`

			`result = lib.get_documents()`

			`assert result[0].title == "My Title"`


			`# ---------------------------------------------------------------------------`
			`# Bug 2: update_document handles Triple objects (attribute access)`
			`# ---------------------------------------------------------------------------`

			`class TestUpdateDocumentTripleAccess:`

			`def test_triple_objects_serialized_correctly(self):`
			`lib, api = _make_library({})`

			`metadata = DocumentMetadata(`
			`id="doc-1",`
			`time=datetime.datetime.fromtimestamp(1700000000),`
			`kind="text/plain",`
			`title="Test",`
			`comments="",`
			`metadata=[`
			`Triple(`
			`s=Uri("http://example.org/entity/alice"),`
			`p=Uri("http://example.org/rel/knows"),`
			`o=Literal("Bob"),`
			`),`
			`],`
			`tags=["test"],`
			`)`

			`lib.update_document(id="doc-1", metadata=metadata)`

			`call_args = api.request.call_args[0][1]`
			`triples = call_args["document-metadata"]["metadata"]`

			`assert len(triples) == 1`
			`assert triples[0]["s"]["i"] == "http://example.org/entity/alice"`
			`assert triples[0]["p"]["i"] == "http://example.org/rel/knows"`
			`assert triples[0]["o"]["v"] == "Bob"`

			`def test_empty_metadata_list(self):`
			`lib, api = _make_library({})`

			`metadata = DocumentMetadata(`
			`id="doc-1",`
			`time=datetime.datetime.fromtimestamp(1700000000),`
			`kind="text/plain",`
			`title="Test",`
			`comments="",`
			`metadata=[],`
			`tags=[],`
			`)`

			`lib.update_document(id="doc-1", metadata=metadata)`

			`call_args = api.request.call_args[0][1]`
			`assert call_args["document-metadata"]["metadata"] == []`


			`# ---------------------------------------------------------------------------`
			`# Bug 3: update_document serializes datetime to int seconds`
			`# ---------------------------------------------------------------------------`

			`class TestUpdateDocumentTimeSerialization:`

			`def test_datetime_serialized_to_int(self):`
			`lib, api = _make_library({})`

			`ts = 1700000000`
			`metadata = DocumentMetadata(`
			`id="doc-1",`
			`time=datetime.datetime.fromtimestamp(ts),`
			`kind="text/plain",`
			`title="Test",`
			`comments="",`
			`metadata=[],`
			`tags=[],`
			`)`

			`lib.update_document(id="doc-1", metadata=metadata)`

			`call_args = api.request.call_args[0][1]`
			`wire_time = call_args["document-metadata"]["time"]`

			`assert isinstance(wire_time, int)`
			`assert wire_time == ts`

			`def test_int_time_passed_through(self):`
			`lib, api = _make_library({})`

			`metadata = DocumentMetadata(`
			`id="doc-1",`
			`time=1700000000,`
			`kind="text/plain",`
			`title="Test",`
			`comments="",`
			`metadata=[],`
			`tags=[],`
			`)`

			`lib.update_document(id="doc-1", metadata=metadata)`

			`call_args = api.request.call_args[0][1]`
			`assert call_args["document-metadata"]["time"] == 1700000000`


			`# ---------------------------------------------------------------------------`
			`# Bug 4: update_document handles empty server response`
			`# ---------------------------------------------------------------------------`

			`class TestUpdateDocumentEmptyResponse:`

			`def test_empty_response_returns_input_metadata(self):`
			`lib, api = _make_library({})`

			`metadata = DocumentMetadata(`
			`id="doc-1",`
			`time=datetime.datetime.fromtimestamp(1700000000),`
			`kind="text/plain",`
			`title="Updated Title",`
			`comments="notes",`
			`metadata=[],`
			`tags=["a"],`
			`)`

			`result = lib.update_document(id="doc-1", metadata=metadata)`

			`assert result is metadata`

			`def test_full_response_parsed(self):`
			`response_doc = _doc_wire(`
			`id="doc-1", title="Server Title", tags=["b"],`
			`)`
			`lib, api = _make_library({"document-metadata": response_doc})`

			`metadata = DocumentMetadata(`
			`id="doc-1",`
			`time=datetime.datetime.fromtimestamp(1700000000),`
			`kind="text/plain",`
			`title="Client Title",`
			`comments="",`
			`metadata=[],`
			`tags=["a"],`
			`)`

			`result = lib.update_document(id="doc-1", metadata=metadata)`

			`assert result.title == "Server Title"`
			`assert result.tags == ["b"]`


			`# ---------------------------------------------------------------------------`
			`# Bug 5: update_document sends both id and document-id`
			`# ---------------------------------------------------------------------------`

			`class TestUpdateDocumentIdKeys:`

			`def test_both_id_keys_sent(self):`
			`lib, api = _make_library({})`

			`metadata = DocumentMetadata(`
			`id="doc-1",`
			`time=datetime.datetime.fromtimestamp(1700000000),`
			`kind="text/plain",`
			`title="Test",`
			`comments="",`
			`metadata=[],`
			`tags=[],`
			`)`

			`lib.update_document(id="doc-1", metadata=metadata)`

			`call_args = api.request.call_args[0][1]`
			`doc_meta = call_args["document-metadata"]`

			`assert doc_meta["id"] == "doc-1"`
			`assert doc_meta["document-id"] == "doc-1"`


			`# ---------------------------------------------------------------------------`
			`# Round-trip: get_documents → update_document`
			`# ---------------------------------------------------------------------------`

			`class TestGetUpdateRoundTrip:`

			`def test_full_round_trip(self):`
			`wire_doc = _doc_wire(`
			`id="doc-42",`
			`title="Original",`
			`tags=["v1"],`
			`metadata=[_wire_triple(`
			`"http://example.org/e/1",`
			`"http://example.org/r/type",`
			`"report",`
			`)],`
			`)`

			`lib, api = _make_library({"document-metadatas": [wire_doc]})`

			`docs = lib.get_documents()`
			`assert len(docs) == 1`

			`doc = docs[0]`
			`doc.title = "Updated"`
			`doc.tags.append("v2")`

			`# Server returns empty on update`
			`api.request.return_value = {}`
			`result = lib.update_document(id=doc.id, metadata=doc)`

			`# Should not raise, should return the input metadata`
			`assert result.title == "Updated"`
			`assert "v2" in result.tags`

			`# Verify the wire format sent`
			`call_args = api.request.call_args[0][1]`
			`doc_meta = call_args["document-metadata"]`

			`assert doc_meta["id"] == "doc-42"`
			`assert doc_meta["title"] == "Updated"`
			`assert isinstance(doc_meta["time"], int)`
			`assert len(doc_meta["metadata"]) == 1`
			`assert doc_meta["metadata"][0]["o"]["v"] == "report"`