2025-08-28 13:39:28 +01:00
|
|
|
"""Unit tests for Subscriber graceful shutdown functionality."""
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
import asyncio
|
|
|
|
|
import uuid
|
|
|
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
|
from trustgraph.base.subscriber import Subscriber
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def mock_pulsar_backend():
    """Provide a mocked backend whose ``create_consumer()`` yields a mock consumer.

    The consumer has ``receive``, ``acknowledge``, ``negative_acknowledge``,
    ``pause_message_listener``, ``unsubscribe`` and ``close`` stubbed with
    their own ``MagicMock`` instances.
    """
    consumer = MagicMock()

    # Stub the consumer methods explicitly so the surface used by these
    # tests is listed in one place.
    for method_name in (
        "receive",
        "acknowledge",
        "negative_acknowledge",
        "pause_message_listener",
        "unsubscribe",
        "close",
    ):
        setattr(consumer, method_name, MagicMock())

    backend = MagicMock()
    backend.create_consumer.return_value = consumer
    return backend
|
2025-08-28 13:39:28 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def subscriber(mock_pulsar_backend):
    """Build a ``Subscriber`` wired to the mocked backend fixture.

    Uses a queue size of 10, a 2 second drain timeout and the "block"
    backpressure strategy.
    """
    settings = {
        "backend": mock_pulsar_backend,
        "topic": "test-topic",
        "subscription": "test-subscription",
        "consumer_name": "test-consumer",
        "schema": dict,
        "max_size": 10,
        "drain_timeout": 2.0,
        "backpressure_strategy": "block",
    }
    return Subscriber(**settings)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_mock_message(message_id="test-id", data=None):
    """Create a mock Pulsar message.

    Args:
        message_id: Value exposed under the ``"id"`` key of the dict
            returned by ``msg.properties()``.
        data: Payload returned by ``msg.value()``. When omitted (``None``)
            a default ``{"test": "data"}`` payload is used.

    Returns:
        A ``MagicMock`` whose ``properties()`` and ``value()`` return the
        configured values.
    """
    # Compare against None rather than relying on truthiness, so that an
    # explicitly supplied empty payload (e.g. ``{}``) is preserved instead
    # of being silently swapped for the default.
    if data is None:
        data = {"test": "data"}

    msg = MagicMock()
    msg.properties.return_value = {"id": message_id}
    msg.value.return_value = data
    return msg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_subscriber_deferred_acknowledgment_success():
    """A message delivered to its subscription queue is acked, never nacked."""
    consumer = MagicMock()
    backend = MagicMock()
    backend.create_consumer.return_value = consumer

    sub = Subscriber(
        backend=backend,
        topic="test-topic",
        subscription="test-subscription",
        consumer_name="test-consumer",
        schema=dict,
        max_size=10,
        backpressure_strategy="block"
    )
    # Attach the consumer by hand; this test never calls start().
    sub.consumer = consumer

    # Register a subscription; the message id below uses the same name.
    delivery_queue = await sub.subscribe("test-queue")

    message = create_mock_message("test-queue", {"data": "test"})
    await sub._process_message(message)

    # Successful delivery must be acknowledged exactly once.
    consumer.acknowledge.assert_called_once_with(message)
    consumer.negative_acknowledge.assert_not_called()

    # The payload must have landed on the subscription queue.
    assert not delivery_queue.empty()
    delivered = await delivery_queue.get()
    assert delivered == {"data": "test"}

    # Clean up
    await sub.stop()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_subscriber_dropped_message_still_acks():
    """A message dropped under backpressure is still acknowledged.

    Negatively acknowledging a dropped message on a shared topic would
    cause it to be redelivered to all subscribers, none of whom can
    handle it either, producing a tight redelivery loop. Acking the
    dropped message avoids that storm.
    """
    consumer = MagicMock()
    backend = MagicMock()
    backend.create_consumer.return_value = consumer

    sub = Subscriber(
        backend=backend,
        topic="test-topic",
        subscription="test-subscription",
        consumer_name="test-consumer",
        schema=dict,
        max_size=1,  # Very small queue
        backpressure_strategy="drop_new"
    )
    # Attach the consumer by hand; this test never calls start().
    sub.consumer = consumer

    # Occupy the single slot so the next delivery finds the queue full.
    full_queue = await sub.subscribe("test-queue")
    await full_queue.put({"existing": "data"})

    # With a full queue and the drop_new strategy this message is dropped.
    message = create_mock_message("test-queue", {"data": "test"})
    await sub._process_message(message)

    # Acked despite the failed delivery - prevents a redelivery storm.
    consumer.acknowledge.assert_called_once_with(message)
    consumer.negative_acknowledge.assert_not_called()

    # Clean up
    await sub.stop()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_subscriber_orphaned_message_acks():
    """A message with no matching waiter is still acknowledged.

    On shared response topics a reply can arrive for a waiter that no
    longer exists (e.g. the client disconnected or the request timed
    out). Such orphaned messages must be acked to prevent redelivery
    storms.
    """
    consumer = MagicMock()
    backend = MagicMock()
    backend.create_consumer.return_value = consumer

    sub = Subscriber(
        backend=backend,
        topic="test-topic",
        subscription="test-subscription",
        consumer_name="test-consumer",
        schema=dict,
        max_size=10,
        backpressure_strategy="block"
    )
    # Attach the consumer by hand; this test never calls start().
    sub.consumer = consumer

    # Deliberately register no subscriptions: this simulates a response
    # arriving after its waiter has unsubscribed, so the message id
    # below matches nothing.
    orphan = create_mock_message("non-existent-waiter-id", {"data": "orphaned"})
    await sub._process_message(orphan)

    # Orphaned message is acked - prevents a redelivery storm.
    consumer.acknowledge.assert_called_once_with(orphan)
    consumer.negative_acknowledge.assert_not_called()

    # Clean up
    await sub.stop()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_subscriber_backpressure_strategies():
    """Test the ``drop_oldest`` backpressure strategy.

    With a full queue, delivering a new message should evict the oldest
    queued entry, keep the newer one, enqueue the new message and still
    acknowledge it.
    """
    mock_backend = MagicMock()
    mock_consumer = MagicMock()
    mock_backend.create_consumer.return_value = mock_consumer

    # Test drop_oldest strategy
    subscriber = Subscriber(
        backend=mock_backend,
        topic="test-topic",
        subscription="test-subscription",
        consumer_name="test-consumer",
        schema=dict,
        max_size=2,
        backpressure_strategy="drop_oldest"
    )
    # Attach the consumer by hand; this test never calls start().
    subscriber.consumer = mock_consumer

    queue = await subscriber.subscribe("test-queue")

    # Fill queue
    await queue.put({"data": "old1"})
    await queue.put({"data": "old2"})

    # Add new message (should drop oldest) - use matching queue name
    msg = create_mock_message("test-queue", {"data": "new"})
    await subscriber._process_message(msg)

    # Should acknowledge delivery
    mock_consumer.acknowledge.assert_called_once_with(msg)

    # Drain everything that is left so the contents can be inspected
    messages = []
    while not queue.empty():
        messages.append(await queue.get())

    # Should contain old2 and new (old1 was dropped). Checking all three
    # entries pins the "oldest" part of the strategy: a length check plus
    # the presence of "new" alone would also pass if old2 were evicted.
    assert len(messages) == 2
    assert {"data": "new"} in messages
    assert {"data": "old2"} in messages
    assert {"data": "old1"} not in messages

    # Clean up
    await subscriber.stop()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_subscriber_graceful_shutdown():
    """Test Subscriber graceful shutdown with queue draining.

    ``Subscriber.run`` is patched with a stand-in coroutine that signals
    readiness, idles while the subscriber is running, drains the test
    queue once the subscriber enters the draining state, and finally
    unsubscribes and closes the mock consumer. The test then checks the
    running/draining flags before and during ``stop()`` and that the
    consumer was cleaned up exactly once.
    """
    mock_backend = MagicMock()
    mock_consumer = MagicMock()
    mock_backend.create_consumer.return_value = mock_consumer

    subscriber = Subscriber(
        backend=mock_backend,
        topic="test-topic",
        subscription="test-subscription",
        consumer_name="test-consumer",
        schema=dict,
        max_size=10,
        drain_timeout=1.0
    )

    # Create subscription with messages before starting
    queue = await subscriber.subscribe("test-queue")
    await queue.put({"data": "msg1"})
    await queue.put({"data": "msg2"})

    with patch.object(subscriber, 'run') as mock_run:
        # Mock run that simulates graceful shutdown
        async def mock_run_graceful():
            # Honor the readiness contract: real run() signals _ready
            # after binding the consumer, so start() can unblock. Mocks
            # of run() must do the same or start() hangs forever.
            subscriber._ready.set_result(None)
            # Process messages while running, then drain
            while subscriber.running or subscriber.draining:
                if subscriber.draining:
                    # Simulate pause message listener
                    mock_consumer.pause_message_listener()
                    # Drain messages
                    while not queue.empty():
                        await queue.get()
                    break
                # Yield to the event loop so stop() can flip the flags
                await asyncio.sleep(0.05)

            # Cleanup
            mock_consumer.unsubscribe()
            mock_consumer.close()

        mock_run.side_effect = mock_run_graceful

        await subscriber.start()

        # Initial state
        assert subscriber.running is True
        assert subscriber.draining is False

        # Start shutdown as a background task so the intermediate
        # draining state can be observed before stop() completes.
        # NOTE(review): presumably stop() sets running=False and
        # draining=True before waiting on the run task - confirm
        # against Subscriber.stop().
        stop_task = asyncio.create_task(subscriber.stop())

        # Allow brief processing
        await asyncio.sleep(0.1)

        # Should be in drain state
        assert subscriber.running is False
        assert subscriber.draining is True

        # Complete shutdown
        await stop_task

        # Should have cleaned up
        mock_consumer.unsubscribe.assert_called_once()
        mock_consumer.close.assert_called_once()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_subscriber_drain_timeout():
    """Check the drain timeout setting and that queued messages remain.

    This test does not run start()/stop(); it only verifies that the
    configured ``drain_timeout`` is stored on the subscriber and that a
    partially filled queue still holds its messages.
    """
    consumer = MagicMock()
    backend = MagicMock()
    backend.create_consumer.return_value = consumer

    sub = Subscriber(
        backend=backend,
        topic="test-topic",
        subscription="test-subscription",
        consumer_name="test-consumer",
        schema=dict,
        max_size=10,
        drain_timeout=0.1  # Very short timeout
    )

    # Queue several messages, staying below max_size=10 so that put()
    # never blocks.
    pending = await sub.subscribe("test-queue")
    for i in range(5):
        await pending.put({"data": f"msg{i}"})

    # The timeout value is stored as given and the queue holds the
    # messages that were put.
    assert sub.drain_timeout == 0.1
    assert not pending.empty()
    assert pending.qsize() == 5

    # Conceptual stand-in for a timed-out drain: messages are still
    # queued, without driving the real async shutdown path.
    leftover = pending.qsize()
    assert leftover > 0  # Should have messages that would timeout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_subscriber_pending_acks_cleanup():
    """Pending acks are negatively acknowledged and cleared by shutdown.

    Subscriber.run is swapped for a stand-in coroutine that imitates the
    shutdown cleanup, so the test drives the real start()/stop() sequence
    around that simulated cleanup.
    """
    consumer = MagicMock()
    backend = MagicMock()
    backend.create_consumer.return_value = consumer

    subscriber = Subscriber(
        backend=backend,
        topic="test-topic",
        subscription="test-subscription",
        consumer_name="test-consumer",
        schema=dict,
        max_size=10,
    )

    # Two in-flight messages that have not been acknowledged yet.
    first_msg = create_mock_message("msg-1")
    second_msg = create_mock_message("msg-2")
    subscriber.pending_acks["ack-1"] = first_msg
    subscriber.pending_acks["ack-2"] = second_msg

    async def fake_run():
        """Stand-in for Subscriber.run that only performs shutdown cleanup."""
        # start() waits on the _ready future and returns only once run()
        # has resolved it, so a patched run() has to signal readiness
        # itself (same contract as in test_subscriber_graceful_shutdown).
        subscriber._ready.set_result(None)

        # Idle until the subscriber is neither running nor draining, or
        # until draining is observed after a sleep.
        while True:
            if not subscriber.running and not subscriber.draining:
                break
            await asyncio.sleep(0.05)
            if subscriber.draining:
                break

        # Imitate the cleanup the real run() is expected to do on exit.
        for pending in subscriber.pending_acks.values():
            consumer.negative_acknowledge(pending)
        subscriber.pending_acks.clear()

        consumer.unsubscribe()
        consumer.close()

    with patch.object(subscriber, 'run', side_effect=fake_run):
        await subscriber.start()
        await subscriber.stop()

    # Each pending message must have been negatively acknowledged once.
    assert consumer.negative_acknowledge.call_count == 2
    for expected in (first_msg, second_msg):
        consumer.negative_acknowledge.assert_any_call(expected)

    # Nothing may be left waiting for an acknowledgment.
    assert len(subscriber.pending_acks) == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_subscriber_multiple_subscribers():
    """A message reaches its id-matched queue and the subscribe_all queue only.

    Three subscriptions are registered ("queue-1", "queue-2" and a
    subscribe_all one); a single message whose id is "queue-1" is then
    pushed through _process_message.
    """
    consumer = MagicMock()
    backend = MagicMock()
    backend.create_consumer.return_value = consumer

    subscriber = Subscriber(
        backend=backend,
        topic="test-topic",
        subscription="test-subscription",
        consumer_name="test-consumer",
        schema=dict,
        max_size=10,
    )

    # Attach the consumer directly rather than going through start().
    subscriber.consumer = consumer

    # Two id-specific subscriptions plus one that receives everything.
    targeted_queue = await subscriber.subscribe("queue-1")
    untouched_queue = await subscriber.subscribe("queue-2")
    broadcast_queue = await subscriber.subscribe_all("queue-all")

    # Deliver a single message addressed to "queue-1".
    message = create_mock_message("queue-1", {"data": "broadcast"})
    await subscriber._process_message(message)

    # Successful delivery must lead to exactly one acknowledgment.
    consumer.acknowledge.assert_called_once_with(message)

    # Only the addressed queue and the catch-all queue hold the message.
    assert not targeted_queue.empty()
    assert untouched_queue.empty()  # nothing was addressed to queue-2
    assert not broadcast_queue.empty()

    # Both recipients must see the original payload.
    expected_payload = {"data": "broadcast"}
    targeted_item = await targeted_queue.get()
    broadcast_item = await broadcast_queue.get()
    assert targeted_item == expected_payload
    assert broadcast_item == expected_payload
|