mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
Updated test suite for explainability & provenance (#696)
* Provenance tests * Embeddings tests * Test librarian * Test triples stream * Test concurrency * Entity centric graph writes * Agent tool service tests * Structured data tests * RDF tests * Additional LLM tests * Reliability tests
This commit is contained in:
parent
e6623fc915
commit
29b4300808
36 changed files with 8799 additions and 0 deletions
1
tests/unit/test_structured_data/__init__.py
Normal file
1
tests/unit/test_structured_data/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
|
||||
296
tests/unit/test_structured_data/test_row_embeddings_query.py
Normal file
296
tests/unit/test_structured_data/test_row_embeddings_query.py
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
"""
|
||||
Tests for row embeddings query service: collection naming, query execution,
|
||||
index filtering, result conversion, and error handling.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, AsyncMock, patch
|
||||
|
||||
from trustgraph.schema import (
|
||||
RowEmbeddingsRequest, RowEmbeddingsResponse,
|
||||
RowIndexMatch, Error,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_processor(qdrant_client=None):
    """Build a Processor instance, bypassing the full FlowProcessor __init__.

    The import is deferred so this module can be collected even when the
    service package is unavailable at import time.
    """
    from trustgraph.query.row_embeddings.qdrant.service import Processor
    instance = Processor.__new__(Processor)
    if qdrant_client is None:
        qdrant_client = MagicMock()
    instance.qdrant = qdrant_client
    return instance
|
||||
|
||||
|
||||
def _make_request(vector=None, user="test-user", collection="test-col",
                  schema_name="customers", limit=10, index_name=None):
    """Build a RowEmbeddingsRequest populated with sensible test defaults.

    Defaults are applied only when an argument is None.  The previous
    `vector or [0.1, 0.2, 0.3]` form silently replaced an explicitly
    passed empty list with the default (empty list is falsy), so the
    empty-vector test path never actually sent an empty vector.
    """
    return RowEmbeddingsRequest(
        vector=[0.1, 0.2, 0.3] if vector is None else vector,
        user=user,
        collection=collection,
        schema_name=schema_name,
        limit=limit,
        # "" is a valid explicit value; only None falls back to "".
        index_name="" if index_name is None else index_name,
    )
|
||||
|
||||
|
||||
def _make_search_point(index_name, index_value, text, score):
|
||||
point = MagicMock()
|
||||
point.payload = {
|
||||
"index_name": index_name,
|
||||
"index_value": index_value,
|
||||
"text": text,
|
||||
}
|
||||
point.score = score
|
||||
return point
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# sanitize_name
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSanitizeName:
    """Unit tests for Processor.sanitize_name collection-name sanitization."""

    def test_simple_name(self):
        # An already-valid lowercase name passes through untouched.
        assert _make_processor().sanitize_name("customers") == "customers"

    def test_special_chars_replaced(self):
        # Dashes and dots are normalized to underscores.
        assert _make_processor().sanitize_name("my-schema.v2") == "my_schema_v2"

    def test_leading_digit_prefixed(self):
        # Names starting with a digit gain an "r_" prefix but keep the name.
        sanitized = _make_processor().sanitize_name("123schema")
        assert sanitized.startswith("r_")
        assert "123schema" in sanitized

    def test_uppercase_lowercased(self):
        assert _make_processor().sanitize_name("MySchema") == "myschema"

    def test_spaces_replaced(self):
        assert _make_processor().sanitize_name("my schema") == "my_schema"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# find_collection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFindCollection:
    """Tests for Processor.find_collection prefix matching over Qdrant collections."""

    @staticmethod
    def _install_collections(proc, names):
        # Stub qdrant.get_collections() so it reports the given collection names.
        entries = []
        for collection_name in names:
            entry = MagicMock()
            entry.name = collection_name
            entries.append(entry)
        listing = MagicMock()
        listing.collections = entries
        proc.qdrant.get_collections.return_value = listing

    def test_finds_matching_collection(self):
        proc = _make_processor()
        self._install_collections(
            proc, ["rows_test_user_test_col_customers_384"])

        # Prefix searched: rows_test_user_test_col_customers_
        found = proc.find_collection("test-user", "test-col", "customers")
        assert found == "rows_test_user_test_col_customers_384"

    def test_returns_none_when_no_match(self):
        proc = _make_processor()
        self._install_collections(
            proc, ["rows_other_user_other_col_schema_768"])

        found = proc.find_collection("test-user", "test-col", "customers")
        assert found is None

    def test_returns_none_on_error(self):
        # Listing failures are swallowed and reported as "not found".
        proc = _make_processor()
        proc.qdrant.get_collections.side_effect = Exception("connection error")

        assert proc.find_collection("user", "col", "schema") is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# query_row_embeddings
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestQueryRowEmbeddings:
    """Tests for Processor.query_row_embeddings: early exits, result
    conversion to RowIndexMatch, index-name filtering, and error
    propagation. Qdrant access is mocked via proc.qdrant; collection
    resolution is stubbed by patching proc.find_collection directly."""

    @pytest.mark.asyncio
    async def test_empty_vector_returns_empty(self):
        # An empty query vector should short-circuit to an empty result.
        # NOTE(review): _make_request uses `vector or <default>`, which
        # replaces a falsy [] with the default vector — confirm this test
        # actually exercises the empty-vector path.
        proc = _make_processor()
        request = _make_request(vector=[])

        result = await proc.query_row_embeddings(request)
        assert result == []

    @pytest.mark.asyncio
    async def test_no_collection_returns_empty(self):
        # When no matching collection exists, the query returns no matches.
        proc = _make_processor()
        proc.find_collection = MagicMock(return_value=None)
        request = _make_request()

        result = await proc.query_row_embeddings(request)
        assert result == []

    @pytest.mark.asyncio
    async def test_successful_query_returns_matches(self):
        # Qdrant points are converted to RowIndexMatch objects, preserving
        # payload fields and score, in result order.
        proc = _make_processor()
        proc.find_collection = MagicMock(return_value="rows_u_c_s_384")

        points = [
            _make_search_point("name", ["Alice Smith"], "Alice Smith", 0.95),
            _make_search_point("address", ["123 Main St"], "123 Main St", 0.82),
        ]
        mock_result = MagicMock()
        mock_result.points = points
        proc.qdrant.query_points.return_value = mock_result

        request = _make_request()
        result = await proc.query_row_embeddings(request)

        assert len(result) == 2
        assert isinstance(result[0], RowIndexMatch)
        assert result[0].index_name == "name"
        assert result[0].index_value == ["Alice Smith"]
        assert result[0].score == 0.95
        assert result[1].index_name == "address"

    @pytest.mark.asyncio
    async def test_index_name_filter_applied(self):
        """When index_name is specified, a Qdrant filter should be used."""
        proc = _make_processor()
        proc.find_collection = MagicMock(return_value="rows_u_c_s_384")

        mock_result = MagicMock()
        mock_result.points = []
        proc.qdrant.query_points.return_value = mock_result

        request = _make_request(index_name="address")
        await proc.query_row_embeddings(request)

        # Inspect the keyword arguments of the query_points call.
        call_kwargs = proc.qdrant.query_points.call_args[1]
        assert call_kwargs["query_filter"] is not None

    @pytest.mark.asyncio
    async def test_no_index_name_no_filter(self):
        """When index_name is empty, no filter should be applied."""
        proc = _make_processor()
        proc.find_collection = MagicMock(return_value="rows_u_c_s_384")

        mock_result = MagicMock()
        mock_result.points = []
        proc.qdrant.query_points.return_value = mock_result

        request = _make_request(index_name="")
        await proc.query_row_embeddings(request)

        call_kwargs = proc.qdrant.query_points.call_args[1]
        assert call_kwargs["query_filter"] is None

    @pytest.mark.asyncio
    async def test_missing_payload_fields_default(self):
        """Points with missing payload fields should use defaults."""
        proc = _make_processor()
        proc.find_collection = MagicMock(return_value="rows_u_c_s_384")

        point = MagicMock()
        point.payload = {}  # Empty payload
        point.score = 0.5

        mock_result = MagicMock()
        mock_result.points = [point]
        proc.qdrant.query_points.return_value = mock_result

        request = _make_request()
        result = await proc.query_row_embeddings(request)

        # Missing payload keys fall back to "" / [] / "" respectively.
        assert len(result) == 1
        assert result[0].index_name == ""
        assert result[0].index_value == []
        assert result[0].text == ""

    @pytest.mark.asyncio
    async def test_qdrant_error_propagates(self):
        # query_row_embeddings does not swallow Qdrant errors; the caller
        # (on_message) is responsible for converting them to error responses.
        proc = _make_processor()
        proc.find_collection = MagicMock(return_value="rows_u_c_s_384")
        proc.qdrant.query_points.side_effect = Exception("qdrant down")

        request = _make_request()

        with pytest.raises(Exception, match="qdrant down"):
            await proc.query_row_embeddings(request)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# on_message handler
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestOnMessage:
    """Tests for the Processor.on_message Pulsar-style handler: success
    responses, error responses, and request-id propagation. The flow
    callable maps a producer name to a publisher mock; the message mock
    supplies .value() (the request) and .properties() (metadata)."""

    @pytest.mark.asyncio
    async def test_successful_message_sends_response(self):
        # A successful query produces a RowEmbeddingsResponse with matches
        # and no error, sent via the flow's publisher.
        proc = _make_processor()
        proc.query_row_embeddings = AsyncMock(return_value=[
            RowIndexMatch(index_name="name", index_value=["Alice"],
                          text="Alice", score=0.9),
        ])

        mock_pub = AsyncMock()
        flow = lambda name: mock_pub  # every producer name resolves to mock_pub

        msg = MagicMock()
        msg.value.return_value = _make_request()
        msg.properties.return_value = {"id": "req-1"}

        await proc.on_message(msg, MagicMock(), flow)

        # First positional argument of the publisher's send() call.
        sent = mock_pub.send.call_args[0][0]
        assert isinstance(sent, RowEmbeddingsResponse)
        assert sent.error is None
        assert len(sent.matches) == 1

    @pytest.mark.asyncio
    async def test_error_sends_error_response(self):
        # Query failures are converted into an error response rather than
        # propagating out of the handler.
        proc = _make_processor()
        proc.query_row_embeddings = AsyncMock(
            side_effect=Exception("query failed")
        )

        mock_pub = AsyncMock()
        flow = lambda name: mock_pub

        msg = MagicMock()
        msg.value.return_value = _make_request()
        msg.properties.return_value = {"id": "req-2"}

        await proc.on_message(msg, MagicMock(), flow)

        sent = mock_pub.send.call_args[0][0]
        assert sent.error is not None
        assert sent.error.type == "row-embeddings-query-error"
        assert "query failed" in sent.error.message
        assert sent.matches == []

    @pytest.mark.asyncio
    async def test_message_id_preserved(self):
        # The incoming request id must be echoed back in the response
        # properties so the client can correlate request and response.
        proc = _make_processor()
        proc.query_row_embeddings = AsyncMock(return_value=[])

        mock_pub = AsyncMock()
        flow = lambda name: mock_pub

        msg = MagicMock()
        msg.value.return_value = _make_request()
        msg.properties.return_value = {"id": "unique-42"}

        await proc.on_message(msg, MagicMock(), flow)

        props = mock_pub.send.call_args[1]["properties"]
        assert props["id"] == "unique-42"
|
||||
235
tests/unit/test_structured_data/test_type_detector.py
Normal file
235
tests/unit/test_structured_data/test_type_detector.py
Normal file
|
|
@ -0,0 +1,235 @@
|
|||
"""
|
||||
Tests for structured data type detection: CSV, JSON, XML format detection,
|
||||
CSV option detection (delimiter, header), and helper functions.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from trustgraph.retrieval.structured_diag.type_detector import (
|
||||
detect_data_type,
|
||||
_check_json_format,
|
||||
_check_xml_format,
|
||||
_check_csv_format,
|
||||
_check_csv_with_delimiter,
|
||||
detect_csv_options,
|
||||
_is_numeric,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# detect_data_type (top-level dispatcher)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDetectDataType:
    """Tests for the top-level detect_data_type dispatcher."""

    def test_empty_string_returns_none(self):
        kind, score = detect_data_type("")
        assert kind is None
        assert score == 0.0

    def test_whitespace_only_returns_none(self):
        kind, score = detect_data_type(" \n \t ")
        assert kind is None
        assert score == 0.0

    def test_none_returns_none(self):
        # None input is tolerated and treated like empty input.
        kind, score = detect_data_type(None)
        assert kind is None
        assert score == 0.0

    def test_json_object_detected(self):
        kind, score = detect_data_type('{"name": "Alice"}')
        assert kind == "json"
        assert score > 0.5

    def test_json_array_detected(self):
        kind, score = detect_data_type('[{"id": 1}, {"id": 2}]')
        assert kind == "json"
        assert score > 0.5

    def test_xml_with_declaration_detected(self):
        kind, score = detect_data_type('<?xml version="1.0"?><root></root>')
        assert kind == "xml"
        assert score > 0.5

    def test_xml_without_declaration_detected(self):
        kind, score = detect_data_type('<root><item>val</item></root>')
        assert kind == "xml"
        assert score > 0.5

    def test_csv_detected(self):
        sample = "name,age,city\nAlice,30,NYC\nBob,25,LA"
        kind, score = detect_data_type(sample)
        assert kind == "csv"
        assert score > 0.5

    def test_plain_text_falls_through_to_csv(self):
        """Non-JSON/XML text defaults to CSV detection."""
        kind, score = detect_data_type("just some text")
        assert kind == "csv"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _check_json_format
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCheckJsonFormat:
    """Tests for the _check_json_format confidence scorer."""

    def test_valid_json_object(self):
        # A well-formed object earns high confidence.
        assert _check_json_format('{"key": "value"}') > 0.9

    def test_valid_json_array_of_objects(self):
        assert _check_json_format('[{"id": 1}, {"id": 2}]') >= 0.9

    def test_valid_json_array_of_primitives(self):
        # Arrays of non-objects score above the threshold but below objects.
        confidence = _check_json_format('[1, 2, 3]')
        assert 0.5 < confidence < 0.9

    def test_empty_json_object(self):
        assert _check_json_format('{}') > 0.5

    def test_invalid_json(self):
        assert _check_json_format('{invalid json}') == 0.0

    def test_non_json_starting_char(self):
        # Text not beginning with { or [ is rejected outright.
        assert _check_json_format('hello world') == 0.0

    def test_empty_array(self):
        # An empty array parses successfully, so the score is non-zero.
        assert _check_json_format('[]') > 0.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _check_xml_format
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCheckXmlFormat:
    """Tests for the _check_xml_format confidence scorer."""

    def test_valid_xml(self):
        assert _check_xml_format('<root><item>val</item></root>') == 0.9

    def test_xml_with_declaration(self):
        document = '<?xml version="1.0"?><root><item>test</item></root>'
        assert _check_xml_format(document) == 0.9

    def test_malformed_xml(self):
        # No closing tag, so the score must stay below the well-formed value.
        assert _check_xml_format('<root><unclosed>') < 0.9

    def test_not_xml(self):
        assert _check_xml_format('just text') == 0.0

    def test_incomplete_xml_tag(self):
        # Opens a tag but never closes it.
        assert _check_xml_format('<root>') <= 0.1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _check_csv_format and _check_csv_with_delimiter
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCheckCsvFormat:
    """Tests for CSV detection across delimiters and edge cases."""

    @staticmethod
    def _assert_detected(data):
        # Shared check: the sample must score as confidently CSV.
        assert _check_csv_format(data) > 0.7

    def test_valid_csv_comma(self):
        self._assert_detected("name,age,city\nAlice,30,NYC\nBob,25,LA")

    def test_valid_csv_semicolon(self):
        self._assert_detected("name;age;city\nAlice;30;NYC\nBob;25;LA")

    def test_valid_csv_tab(self):
        self._assert_detected("name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA")

    def test_valid_csv_pipe(self):
        self._assert_detected("name|age|city\nAlice|30|NYC\nBob|25|LA")

    def test_single_line_not_csv(self):
        assert _check_csv_format("just one line") == 0.0

    def test_single_column_not_csv(self):
        # A delimiter that never splits anything is not CSV.
        assert _check_csv_with_delimiter("a\nb\nc", ",") == 0.0

    def test_inconsistent_columns_low_score(self):
        # Rows with 3, 2, and 4 fields cannot score as consistent CSV.
        assert _check_csv_with_delimiter("a,b,c\n1,2\n3,4,5,6", ",") < 0.7

    def test_many_rows_higher_score(self):
        # More consistent rows raise confidence above the small-sample level.
        body = [f"person{i},{20+i},city{i}" for i in range(20)]
        data = "\n".join(["name,age,city"] + body)
        assert _check_csv_format(data) > 0.8
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# detect_csv_options
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDetectCsvOptions:
    """Tests for delimiter, header, and encoding detection in detect_csv_options."""

    def test_comma_delimiter_detected(self):
        detected = detect_csv_options("name,age,city\nAlice,30,NYC\nBob,25,LA")
        assert detected["delimiter"] == ","

    def test_semicolon_delimiter_detected(self):
        detected = detect_csv_options("name;age;city\nAlice;30;NYC\nBob;25;LA")
        assert detected["delimiter"] == ";"

    def test_tab_delimiter_detected(self):
        detected = detect_csv_options(
            "name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA")
        assert detected["delimiter"] == "\t"

    def test_header_detected_when_first_row_text(self):
        # Textual first row over numeric data rows implies a header line.
        detected = detect_csv_options(
            "name,age,salary\nAlice,30,50000\nBob,25,45000")
        assert detected["has_header"] is True

    def test_no_header_when_all_numeric(self):
        # A fully numeric first row means there is no header.
        detected = detect_csv_options("1,2,3\n4,5,6\n7,8,9")
        assert detected["has_header"] is False

    def test_single_line_returns_defaults(self):
        # Too little data to analyze: fall back to comma + header defaults.
        detected = detect_csv_options("just one line")
        assert detected["delimiter"] == ","
        assert detected["has_header"] is True

    def test_encoding_default(self):
        assert detect_csv_options("a,b\n1,2")["encoding"] == "utf-8"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _is_numeric helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestIsNumeric:
    """Tests for the _is_numeric string predicate."""

    @staticmethod
    def _check(value, expected):
        # _is_numeric must return an actual bool, hence the identity check.
        assert _is_numeric(value) is expected

    def test_integer(self):
        self._check("42", True)

    def test_float(self):
        self._check("3.14", True)

    def test_negative(self):
        self._check("-10", True)

    def test_text(self):
        self._check("hello", False)

    def test_empty(self):
        self._check("", False)

    def test_whitespace_padded(self):
        # Surrounding whitespace is tolerated.
        self._check(" 42 ", True)
|
||||
Loading…
Add table
Add a link
Reference in a new issue