Updated test suite for explainability & provenance (#696)

* Provenance tests

* Embeddings tests

* Test librarian

* Test triples stream

* Test concurrency

* Entity centric graph writes

* Agent tool service tests

* Structured data tests

* RDF tests

* Additional LLM tests

* Reliability tests
This commit is contained in:
cybermaggedon 2026-03-13 14:27:42 +00:00 committed by GitHub
parent e6623fc915
commit 29b4300808
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
36 changed files with 8799 additions and 0 deletions

View file

@ -0,0 +1 @@

View file

@ -0,0 +1,296 @@
"""
Tests for row embeddings query service: collection naming, query execution,
index filtering, result conversion, and error handling.
"""
import pytest
from unittest.mock import MagicMock, AsyncMock, patch
from trustgraph.schema import (
RowEmbeddingsRequest, RowEmbeddingsResponse,
RowIndexMatch, Error,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_processor(qdrant_client=None):
    """Build a bare Processor for tests, bypassing its constructor.

    The instance is allocated with ``__new__`` so ``__init__`` never runs;
    only the ``qdrant`` attribute is populated afterwards.

    Args:
        qdrant_client: Object to install as ``proc.qdrant``. When omitted
            (or falsy) a fresh ``MagicMock`` is used instead.

    Returns:
        The partially initialised ``Processor`` instance.
    """
    # Imported lazily, inside the helper, exactly as the tests expect.
    from trustgraph.query.row_embeddings.qdrant.service import Processor

    processor = Processor.__new__(Processor)
    client = qdrant_client if qdrant_client else MagicMock()
    processor.qdrant = client
    return processor
def _make_request(vector=None, user="test-user", collection="test-col",
                  schema_name="customers", limit=10, index_name=None):
    """Build a RowEmbeddingsRequest populated with test defaults.

    Args:
        vector: Query vector. ``None`` selects the default
            ``[0.1, 0.2, 0.3]``; any other value, including an empty
            list, is forwarded unchanged.
        user: Value for the request's ``user`` field.
        collection: Value for the request's ``collection`` field.
        schema_name: Value for the request's ``schema_name`` field.
        limit: Value for the request's ``limit`` field.
        index_name: Value for the request's ``index_name`` field; ``None``
            (or any falsy value) becomes the empty string.

    Returns:
        The constructed ``RowEmbeddingsRequest``.
    """
    # Compare against None instead of relying on truthiness: with
    # `vector or default`, a caller passing [] to exercise empty-vector
    # handling would silently get the non-empty default vector instead.
    if vector is None:
        vector = [0.1, 0.2, 0.3]

    return RowEmbeddingsRequest(
        vector=vector,
        user=user,
        collection=collection,
        schema_name=schema_name,
        limit=limit,
        index_name=index_name or "",
    )
def _make_search_point(index_name, index_value, text, score):
point = MagicMock()
point.payload = {
"index_name": index_name,
"index_value": index_value,
"text": text,
}
point.score = score
return point
# ---------------------------------------------------------------------------
# sanitize_name
# ---------------------------------------------------------------------------
class TestSanitizeName:
    """Expectations for ``Processor.sanitize_name``."""

    def test_simple_name(self):
        """A plain lowercase name is expected back unchanged."""
        sanitized = _make_processor().sanitize_name("customers")
        assert sanitized == "customers"

    def test_special_chars_replaced(self):
        """Hyphens and dots are expected to become underscores."""
        sanitized = _make_processor().sanitize_name("my-schema.v2")
        assert sanitized == "my_schema_v2"

    def test_leading_digit_prefixed(self):
        """A name starting with a digit should gain an ``r_`` prefix."""
        sanitized = _make_processor().sanitize_name("123schema")
        assert sanitized.startswith("r_")
        # The original text must still be present after prefixing.
        assert "123schema" in sanitized

    def test_uppercase_lowercased(self):
        """Upper-case letters are expected to be lower-cased."""
        sanitized = _make_processor().sanitize_name("MySchema")
        assert sanitized == "myschema"

    def test_spaces_replaced(self):
        """Spaces are expected to become underscores."""
        sanitized = _make_processor().sanitize_name("my schema")
        assert sanitized == "my_schema"
# ---------------------------------------------------------------------------
# find_collection
# ---------------------------------------------------------------------------
class TestFindCollection:
    """Expectations for ``Processor.find_collection`` with a mocked client."""

    @staticmethod
    def _listing(*names):
        """Build a mock ``get_collections`` reply holding the given names."""
        entries = []
        for collection_name in names:
            entry = MagicMock()
            # Assigned after construction so it becomes a real attribute
            # rather than MagicMock's own ``name`` constructor argument.
            entry.name = collection_name
            entries.append(entry)
        reply = MagicMock()
        reply.collections = entries
        return reply

    def test_finds_matching_collection(self):
        """A collection with the expected prefix should be returned."""
        processor = _make_processor()
        processor.qdrant.get_collections.return_value = self._listing(
            "rows_test_user_test_col_customers_384"
        )

        found = processor.find_collection("test-user", "test-col", "customers")

        # Prefix: rows_test_user_test_col_customers_
        assert found == "rows_test_user_test_col_customers_384"

    def test_returns_none_when_no_match(self):
        """Collections belonging to other users/schemas must not match."""
        processor = _make_processor()
        processor.qdrant.get_collections.return_value = self._listing(
            "rows_other_user_other_col_schema_768"
        )

        found = processor.find_collection("test-user", "test-col", "customers")

        assert found is None

    def test_returns_none_on_error(self):
        """A failing ``get_collections`` call should yield ``None``."""
        processor = _make_processor()
        processor.qdrant.get_collections.side_effect = Exception(
            "connection error"
        )

        found = processor.find_collection("user", "col", "schema")

        assert found is None
# ---------------------------------------------------------------------------
# query_row_embeddings
# ---------------------------------------------------------------------------
class TestQueryRowEmbeddings:
    """Expectations for ``Processor.query_row_embeddings`` with mocks."""

    @staticmethod
    def _processor_with_points(points):
        """Return a processor whose collection lookup and query are stubbed.

        ``find_collection`` is replaced by a mock returning
        ``"rows_u_c_s_384"`` and ``qdrant.query_points`` returns an object
        whose ``points`` attribute is *points*.
        """
        processor = _make_processor()
        processor.find_collection = MagicMock(return_value="rows_u_c_s_384")
        reply = MagicMock()
        reply.points = points
        processor.qdrant.query_points.return_value = reply
        return processor

    @pytest.mark.asyncio
    async def test_empty_vector_returns_empty(self):
        """A request built with ``vector=[]`` should yield no matches."""
        processor = _make_processor()
        request = _make_request(vector=[])

        matches = await processor.query_row_embeddings(request)

        assert matches == []

    @pytest.mark.asyncio
    async def test_no_collection_returns_empty(self):
        """No matching collection should yield no matches."""
        processor = _make_processor()
        processor.find_collection = MagicMock(return_value=None)
        request = _make_request()

        matches = await processor.query_row_embeddings(request)

        assert matches == []

    @pytest.mark.asyncio
    async def test_successful_query_returns_matches(self):
        """Returned points should be converted to RowIndexMatch objects."""
        processor = self._processor_with_points([
            _make_search_point("name", ["Alice Smith"], "Alice Smith", 0.95),
            _make_search_point("address", ["123 Main St"], "123 Main St", 0.82),
        ])
        request = _make_request()

        matches = await processor.query_row_embeddings(request)

        assert len(matches) == 2
        first, second = matches[0], matches[1]
        assert isinstance(first, RowIndexMatch)
        assert first.index_name == "name"
        assert first.index_value == ["Alice Smith"]
        assert first.score == 0.95
        assert second.index_name == "address"

    @pytest.mark.asyncio
    async def test_index_name_filter_applied(self):
        """When index_name is specified, a Qdrant filter should be used."""
        processor = self._processor_with_points([])
        request = _make_request(index_name="address")

        await processor.query_row_embeddings(request)

        # call_args unpacks into (positional args, keyword args).
        _, keyword_args = processor.qdrant.query_points.call_args
        assert keyword_args["query_filter"] is not None

    @pytest.mark.asyncio
    async def test_no_index_name_no_filter(self):
        """When index_name is empty, no filter should be applied."""
        processor = self._processor_with_points([])
        request = _make_request(index_name="")

        await processor.query_row_embeddings(request)

        _, keyword_args = processor.qdrant.query_points.call_args
        assert keyword_args["query_filter"] is None

    @pytest.mark.asyncio
    async def test_missing_payload_fields_default(self):
        """Points with missing payload fields should use defaults."""
        bare_point = MagicMock()
        bare_point.payload = {}  # Empty payload
        bare_point.score = 0.5
        processor = self._processor_with_points([bare_point])
        request = _make_request()

        matches = await processor.query_row_embeddings(request)

        assert len(matches) == 1
        only = matches[0]
        assert only.index_name == ""
        assert only.index_value == []
        assert only.text == ""

    @pytest.mark.asyncio
    async def test_qdrant_error_propagates(self):
        """An exception raised by the Qdrant query should reach the caller."""
        processor = _make_processor()
        processor.find_collection = MagicMock(return_value="rows_u_c_s_384")
        processor.qdrant.query_points.side_effect = Exception("qdrant down")
        request = _make_request()

        with pytest.raises(Exception, match="qdrant down"):
            await processor.query_row_embeddings(request)
# ---------------------------------------------------------------------------
# on_message handler
# ---------------------------------------------------------------------------
class TestOnMessage:
    """Expectations for the ``Processor.on_message`` handler."""

    @staticmethod
    async def _deliver(processor, request_id):
        """Feed one mocked message to ``on_message`` and return the publisher.

        The message's ``value()`` is a default test request and its
        ``properties()`` is ``{"id": request_id}``. The flow callable
        returns the same ``AsyncMock`` publisher for every name.
        """
        publisher = AsyncMock()

        def flow(name):
            return publisher

        message = MagicMock()
        message.value.return_value = _make_request()
        message.properties.return_value = {"id": request_id}

        await processor.on_message(message, MagicMock(), flow)
        return publisher

    @pytest.mark.asyncio
    async def test_successful_message_sends_response(self):
        """A successful query should publish a response without an error."""
        processor = _make_processor()
        processor.query_row_embeddings = AsyncMock(return_value=[
            RowIndexMatch(index_name="name", index_value=["Alice"],
                          text="Alice", score=0.9),
        ])

        publisher = await self._deliver(processor, "req-1")

        positional, _ = publisher.send.call_args
        response = positional[0]
        assert isinstance(response, RowEmbeddingsResponse)
        assert response.error is None
        assert len(response.matches) == 1

    @pytest.mark.asyncio
    async def test_error_sends_error_response(self):
        """A failing query should publish an error response with no matches."""
        processor = _make_processor()
        processor.query_row_embeddings = AsyncMock(
            side_effect=Exception("query failed")
        )

        publisher = await self._deliver(processor, "req-2")

        positional, _ = publisher.send.call_args
        response = positional[0]
        assert response.error is not None
        assert response.error.type == "row-embeddings-query-error"
        assert "query failed" in response.error.message
        assert response.matches == []

    @pytest.mark.asyncio
    async def test_message_id_preserved(self):
        """The incoming message id should be echoed in the send properties."""
        processor = _make_processor()
        processor.query_row_embeddings = AsyncMock(return_value=[])

        publisher = await self._deliver(processor, "unique-42")

        _, keyword_args = publisher.send.call_args
        sent_properties = keyword_args["properties"]
        assert sent_properties["id"] == "unique-42"

View file

@ -0,0 +1,235 @@
"""
Tests for structured data type detection: CSV, JSON, XML format detection,
CSV option detection (delimiter, header), and helper functions.
"""
import pytest
from trustgraph.retrieval.structured_diag.type_detector import (
detect_data_type,
_check_json_format,
_check_xml_format,
_check_csv_format,
_check_csv_with_delimiter,
detect_csv_options,
_is_numeric,
)
# ---------------------------------------------------------------------------
# detect_data_type (top-level dispatcher)
# ---------------------------------------------------------------------------
class TestDetectDataType:
    """Expectations for the top-level ``detect_data_type`` dispatcher."""

    def test_empty_string_returns_none(self):
        """An empty string should give no type and zero confidence."""
        kind, score = detect_data_type("")
        assert kind is None
        assert score == 0.0

    def test_whitespace_only_returns_none(self):
        """Whitespace-only input should give no type and zero confidence."""
        kind, score = detect_data_type(" \n \t ")
        assert kind is None
        assert score == 0.0

    def test_none_returns_none(self):
        """``None`` input should give no type and zero confidence."""
        kind, score = detect_data_type(None)
        assert kind is None
        assert score == 0.0

    def test_json_object_detected(self):
        """A JSON object should be reported as ``json``."""
        kind, score = detect_data_type('{"name": "Alice"}')
        assert kind == "json"
        assert score > 0.5

    def test_json_array_detected(self):
        """A JSON array of objects should be reported as ``json``."""
        kind, score = detect_data_type('[{"id": 1}, {"id": 2}]')
        assert kind == "json"
        assert score > 0.5

    def test_xml_with_declaration_detected(self):
        """XML carrying an ``<?xml ...?>`` declaration should be ``xml``."""
        kind, score = detect_data_type('<?xml version="1.0"?><root></root>')
        assert kind == "xml"
        assert score > 0.5

    def test_xml_without_declaration_detected(self):
        """XML without a declaration should still be reported as ``xml``."""
        kind, score = detect_data_type('<root><item>val</item></root>')
        assert kind == "xml"
        assert score > 0.5

    def test_csv_detected(self):
        """Comma-separated rows should be reported as ``csv``."""
        sample = "name,age,city\nAlice,30,NYC\nBob,25,LA"
        kind, score = detect_data_type(sample)
        assert kind == "csv"
        assert score > 0.5

    def test_plain_text_falls_through_to_csv(self):
        """Non-JSON/XML text defaults to CSV detection."""
        # Only the detected type is checked; the confidence is ignored.
        kind, _ = detect_data_type("just some text")
        assert kind == "csv"
# ---------------------------------------------------------------------------
# _check_json_format
# ---------------------------------------------------------------------------
class TestCheckJsonFormat:
    """Expectations for the ``_check_json_format`` confidence score."""

    def test_valid_json_object(self):
        """A JSON object should score above 0.9."""
        score = _check_json_format('{"key": "value"}')
        assert score > 0.9

    def test_valid_json_array_of_objects(self):
        """An array of objects should score at least 0.9."""
        score = _check_json_format('[{"id": 1}, {"id": 2}]')
        assert score >= 0.9

    def test_valid_json_array_of_primitives(self):
        """An array of primitives should score between 0.5 and 0.9."""
        score = _check_json_format('[1, 2, 3]')
        # Lower confidence for non-object arrays
        assert 0.5 < score < 0.9

    def test_empty_json_object(self):
        """An empty object should still score above 0.5."""
        score = _check_json_format('{}')
        assert score > 0.5

    def test_invalid_json(self):
        """Text that looks like JSON but does not parse should score 0.0."""
        score = _check_json_format('{invalid json}')
        assert score == 0.0

    def test_non_json_starting_char(self):
        """Text not starting like JSON should score 0.0."""
        score = _check_json_format('hello world')
        assert score == 0.0

    def test_empty_array(self):
        """An empty array should score above zero."""
        score = _check_json_format('[]')
        # Parsed successfully but empty
        assert score > 0.0
# ---------------------------------------------------------------------------
# _check_xml_format
# ---------------------------------------------------------------------------
class TestCheckXmlFormat:
    """Expectations for the ``_check_xml_format`` confidence score."""

    def test_valid_xml(self):
        """Well-formed XML should score exactly 0.9."""
        score = _check_xml_format('<root><item>val</item></root>')
        assert score == 0.9

    def test_xml_with_declaration(self):
        """Well-formed XML with a declaration should also score 0.9."""
        document = '<?xml version="1.0"?><root><item>test</item></root>'
        score = _check_xml_format(document)
        assert score == 0.9

    def test_malformed_xml(self):
        """Unclosed tags should score below a well-formed document."""
        score = _check_xml_format('<root><unclosed>')
        # The input contains "<" but never a closing "</" tag.
        assert score < 0.9

    def test_not_xml(self):
        """Plain text should score 0.0."""
        score = _check_xml_format('just text')
        assert score == 0.0

    def test_incomplete_xml_tag(self):
        """A lone opening tag should score at most 0.1."""
        score = _check_xml_format('<root>')
        # Starts with < but no closing tag
        assert score <= 0.1
# ---------------------------------------------------------------------------
# _check_csv_format and _check_csv_with_delimiter
# ---------------------------------------------------------------------------
class TestCheckCsvFormat:
    """Expectations for ``_check_csv_format`` and ``_check_csv_with_delimiter``."""

    def test_valid_csv_comma(self):
        """Comma-delimited rows should score above 0.7."""
        sample = "name,age,city\nAlice,30,NYC\nBob,25,LA"
        score = _check_csv_format(sample)
        assert score > 0.7

    def test_valid_csv_semicolon(self):
        """Semicolon-delimited rows should score above 0.7."""
        sample = "name;age;city\nAlice;30;NYC\nBob;25;LA"
        score = _check_csv_format(sample)
        assert score > 0.7

    def test_valid_csv_tab(self):
        """Tab-delimited rows should score above 0.7."""
        sample = "name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA"
        score = _check_csv_format(sample)
        assert score > 0.7

    def test_valid_csv_pipe(self):
        """Pipe-delimited rows should score above 0.7."""
        sample = "name|age|city\nAlice|30|NYC\nBob|25|LA"
        score = _check_csv_format(sample)
        assert score > 0.7

    def test_single_line_not_csv(self):
        """A single line of text should score 0.0."""
        score = _check_csv_format("just one line")
        assert score == 0.0

    def test_single_column_not_csv(self):
        """Rows without the delimiter should score 0.0 for that delimiter."""
        sample = "a\nb\nc"
        score = _check_csv_with_delimiter(sample, ",")
        assert score == 0.0

    def test_inconsistent_columns_low_score(self):
        """Rows with differing column counts should score below 0.7."""
        sample = "a,b,c\n1,2\n3,4,5,6"
        score = _check_csv_with_delimiter(sample, ",")
        assert score < 0.7

    def test_many_rows_higher_score(self):
        """A header plus twenty consistent rows should score above 0.8."""
        lines = ["name,age,city"]
        lines.extend(f"person{i},{20+i},city{i}" for i in range(20))
        sample = "\n".join(lines)
        score = _check_csv_format(sample)
        assert score > 0.8
# ---------------------------------------------------------------------------
# detect_csv_options
# ---------------------------------------------------------------------------
class TestDetectCsvOptions:
    """Expectations for the options dict returned by ``detect_csv_options``."""

    def test_comma_delimiter_detected(self):
        """Comma-separated data should report ``,`` as the delimiter."""
        detected = detect_csv_options("name,age,city\nAlice,30,NYC\nBob,25,LA")
        assert detected["delimiter"] == ","

    def test_semicolon_delimiter_detected(self):
        """Semicolon-separated data should report ``;`` as the delimiter."""
        detected = detect_csv_options("name;age;city\nAlice;30;NYC\nBob;25;LA")
        assert detected["delimiter"] == ";"

    def test_tab_delimiter_detected(self):
        """Tab-separated data should report a tab as the delimiter."""
        detected = detect_csv_options(
            "name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA"
        )
        assert detected["delimiter"] == "\t"

    def test_header_detected_when_first_row_text(self):
        """A textual first row above numeric rows should count as a header."""
        detected = detect_csv_options(
            "name,age,salary\nAlice,30,50000\nBob,25,45000"
        )
        assert detected["has_header"] is True

    def test_no_header_when_all_numeric(self):
        """An all-numeric first row should not count as a header."""
        detected = detect_csv_options("1,2,3\n4,5,6\n7,8,9")
        assert detected["has_header"] is False

    def test_single_line_returns_defaults(self):
        """A single line should give a comma delimiter and ``has_header``."""
        detected = detect_csv_options("just one line")
        assert detected["delimiter"] == ","
        assert detected["has_header"] is True

    def test_encoding_default(self):
        """The reported encoding should be ``utf-8``."""
        detected = detect_csv_options("a,b\n1,2")
        assert detected["encoding"] == "utf-8"
# ---------------------------------------------------------------------------
# _is_numeric helper
# ---------------------------------------------------------------------------
class TestIsNumeric:
    """Expectations for the ``_is_numeric`` helper."""

    def test_integer(self):
        """An integer string should be numeric."""
        outcome = _is_numeric("42")
        assert outcome is True

    def test_float(self):
        """A decimal string should be numeric."""
        outcome = _is_numeric("3.14")
        assert outcome is True

    def test_negative(self):
        """A negative integer string should be numeric."""
        outcome = _is_numeric("-10")
        assert outcome is True

    def test_text(self):
        """Alphabetic text should not be numeric."""
        outcome = _is_numeric("hello")
        assert outcome is False

    def test_empty(self):
        """The empty string should not be numeric."""
        outcome = _is_numeric("")
        assert outcome is False

    def test_whitespace_padded(self):
        """A number surrounded by spaces should still be numeric."""
        outcome = _is_numeric(" 42 ")
        assert outcome is True