From 2c3a699af39111411ab02389781d8fffe2871c0c Mon Sep 17 00:00:00 2001 From: cybermaggedon Date: Thu, 21 May 2026 10:50:11 +0100 Subject: [PATCH] feat: extend SPARQL evaluator with comprehensive function and operator support (#945) Add 30+ SPARQL 1.1 built-in functions and the MINUS algebra operator to the custom SPARQL query backend. String functions: - SUBSTR (2-arg and 3-arg forms), STRBEFORE, STRAFTER - REPLACE (regex with flags), ENCODE_FOR_URI Numeric functions: - FLOOR, CEIL, ROUND, ABS Date/time accessors: - YEAR, MONTH, DAY, HOURS, MINUTES, SECONDS - NOW, TZ Hash functions: - MD5, SHA1, SHA256, SHA512 Term constructors: - IRI/URI, BNODE, UUID, STRUUID Other functions: - LANGMATCHES, RAND - EXISTS / NOT EXISTS (with async pre-evaluation to bridge the sync expression evaluator and async algebra evaluator) Algebra: - MINUS set-difference operator - HAVING already works via rdflib's Filter mapping (verified) Fix SPARQL ORDER handling Includes 653 lines of new unit tests covering all added functionality across expressions, solutions, and algebra layers. --- tests/unit/test_query/test_sparql_algebra.py | 185 ++++++++ .../test_query/test_sparql_expressions.py | 432 ++++++++++++++++++ .../unit/test_query/test_sparql_solutions.py | 60 ++- .../trustgraph/query/sparql/algebra.py | 74 ++- .../trustgraph/query/sparql/expressions.py | 238 +++++++++- .../trustgraph/query/sparql/solutions.py | 61 ++- 6 files changed, 1021 insertions(+), 29 deletions(-) diff --git a/tests/unit/test_query/test_sparql_algebra.py b/tests/unit/test_query/test_sparql_algebra.py index 9827b2de..980ce870 100644 --- a/tests/unit/test_query/test_sparql_algebra.py +++ b/tests/unit/test_query/test_sparql_algebra.py @@ -84,6 +84,20 @@ def make_distinct(inner): return node +def make_filter(inner, expr): + node = CompValue("Filter") + node.p = inner + node.expr = expr + return node + + +def make_minus(left, right): + node = CompValue("Minus") + node.p1 = left + node.p2 = right + return node + + class TestQueryPattern: """Tests for _query_pattern — the leaf that calls TriplesClient.""" @@ -282,6 +296,177 @@ class TestEvaluate: assert len(solutions) == 1 + @pytest.mark.asyncio + async def test_minus_removes_matching(self): + tc = AsyncMock() + + alice = iri("http://example.com/alice") + bob = iri("http://example.com/bob") + knows = iri("http://example.com/knows") + hates = iri("http://example.com/hates") + charlie = iri("http://example.com/charlie") + + left_triple = make_triple(alice, knows, bob) + right_triple1 = make_triple(alice, knows, bob) + right_triple2 = make_triple(alice, hates, charlie) + + left_bgp = make_bgp( + (Variable("s"), URIRef("http://example.com/knows"), Variable("o")) + ) + right_bgp = make_bgp( + (Variable("s"), URIRef("http://example.com/hates"), Variable("r")) + ) + + async def mock_query(**kwargs): + pred = kwargs.get("p") + if pred and pred.iri == "http://example.com/knows": + return [left_triple] + elif pred and pred.iri == "http://example.com/hates": + return [right_triple2] + return [] + + tc.query.side_effect = mock_query + + tree = make_select( + make_project( + make_minus(left_bgp, right_bgp), + ["s", "o"] + ) + ) + + solutions = await evaluate(tree, tc, collection="default") + + # alice knows bob, but alice also hates charlie + # shared var is "s" (alice), so alice's solution is removed + assert len(solutions) == 0 + + @pytest.mark.asyncio + async def test_minus_no_shared_vars_preserves_all(self): + tc = AsyncMock() + + alice = iri("http://example.com/alice") + bob = iri("http://example.com/bob") + + left_triple = make_triple(alice, iri("http://example.com/p"), bob) + + left_bgp = make_bgp( + (Variable("s"), URIRef("http://example.com/p"), Variable("o")) + ) + right_bgp = make_bgp( + (Variable("x"), URIRef("http://example.com/q"), Variable("y")) + ) + + async def mock_query(**kwargs): + pred = kwargs.get("p") + if pred and pred.iri == "http://example.com/p": + return [left_triple] + return [] + + tc.query.side_effect = mock_query + + tree = make_select( + make_project( + make_minus(left_bgp, right_bgp), + ["s", "o"] + ) + ) + + solutions = await evaluate(tree, tc, collection="default") + + assert len(solutions) == 1 + + @pytest.mark.asyncio + async def test_filter_exists_keeps_matching(self): + tc = AsyncMock() + + alice = iri("http://example.com/alice") + bob = iri("http://example.com/bob") + charlie = iri("http://example.com/charlie") + + left_triple1 = make_triple(alice, iri("http://example.com/knows"), bob) + left_triple2 = make_triple(alice, iri("http://example.com/knows"), charlie) + exists_triple = make_triple(bob, iri("http://example.com/likes"), alice) + + left_bgp = make_bgp( + (Variable("s"), URIRef("http://example.com/knows"), Variable("o")) + ) + exists_bgp = make_bgp( + (Variable("o"), URIRef("http://example.com/likes"), Variable("_any")) + ) + + async def mock_query(**kwargs): + pred = kwargs.get("p") + if pred and pred.iri == "http://example.com/knows": + return [left_triple1, left_triple2] + elif pred and pred.iri == "http://example.com/likes": + return [exists_triple] + return [] + + tc.query.side_effect = mock_query + + exists_expr = CompValue("Builtin_EXISTS") + exists_expr.graph = exists_bgp + + tree = make_select( + make_project( + make_filter(left_bgp, exists_expr), + ["s", "o"] + ) + ) + + solutions = await evaluate(tree, tc, collection="default") + + # Only bob has a "likes" triple, so only the bob solution passes + result_objects = [s["o"].iri for s in solutions] + assert "http://example.com/bob" in result_objects + assert "http://example.com/charlie" not in result_objects + + @pytest.mark.asyncio + async def test_filter_not_exists_removes_matching(self): + tc = AsyncMock() + + alice = iri("http://example.com/alice") + bob = iri("http://example.com/bob") + charlie = iri("http://example.com/charlie") + + left_triple1 = make_triple(alice, iri("http://example.com/knows"), bob) + left_triple2 = make_triple(alice, iri("http://example.com/knows"), charlie) + exists_triple = make_triple(bob, iri("http://example.com/likes"), alice) + + left_bgp = make_bgp( + (Variable("s"), URIRef("http://example.com/knows"), Variable("o")) + ) + exists_bgp = make_bgp( + (Variable("o"), URIRef("http://example.com/likes"), Variable("_any")) + ) + + async def mock_query(**kwargs): + pred = kwargs.get("p") + if pred and pred.iri == "http://example.com/knows": + return [left_triple1, left_triple2] + elif pred and pred.iri == "http://example.com/likes": + return [exists_triple] + return [] + + tc.query.side_effect = mock_query + + not_exists_expr = CompValue("Builtin_NOTEXISTS") + not_exists_expr.graph = exists_bgp + + tree = make_select( + make_project( + make_filter(left_bgp, not_exists_expr), + ["s", "o"] + ) + ) + + solutions = await evaluate(tree, tc, collection="default") + + # bob has a "likes" triple so is removed; charlie stays + result_objects = [s["o"].iri for s in solutions] + assert "http://example.com/charlie" in result_objects + assert "http://example.com/bob" not in result_objects + @pytest.mark.asyncio async def test_unsupported_node_returns_empty_solution(self): tc = AsyncMock() diff --git a/tests/unit/test_query/test_sparql_expressions.py b/tests/unit/test_query/test_sparql_expressions.py index 63e9188f..87c862e8 100644 --- a/tests/unit/test_query/test_sparql_expressions.py +++ b/tests/unit/test_query/test_sparql_expressions.py @@ -300,6 +300,438 @@ class TestBuiltinFunctions: flags=None) assert evaluate_expression(expr, {"x": lit("hello")}) is False + def test_substr_three_args(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("SUBSTR", + arg=Variable("x"), + start=Literal(1), + length=Literal(4)) + result = evaluate_expression(expr, {"x": lit("2024-03-15")}) + assert result.type == LITERAL + assert result.value == "2024" + + def test_substr_two_args(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("SUBSTR", + arg=Variable("x"), + start=Literal(6), + length=None) + result = evaluate_expression(expr, {"x": lit("2024-03-15")}) + assert result.type == LITERAL + assert result.value == "03-15" + + def test_substr_middle(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("SUBSTR", + arg=Variable("x"), + start=Literal(6), + length=Literal(2)) + result = evaluate_expression(expr, {"x": lit("2024-03-15")}) + assert result.type == LITERAL + assert result.value == "03" + + def test_substr_null_start(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("SUBSTR", + arg=Variable("x"), + start=Variable("missing"), + length=None) + result = evaluate_expression(expr, {"x": lit("hello")}) + assert result is None + + def test_year(self): + from rdflib.term import Variable + expr = self._make_builtin("YEAR", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15", datatype=XSD + "date")} + ) + assert result == 2024 + + def test_month(self): + from rdflib.term import Variable + expr = self._make_builtin("MONTH", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15", datatype=XSD + "date")} + ) + assert result == 3 + + def test_day(self): + from rdflib.term import Variable + expr = self._make_builtin("DAY", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15", datatype=XSD + "date")} + ) + assert result == 15 + + def test_hours(self): + from rdflib.term import Variable + expr = self._make_builtin("HOURS", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15T10:30:45", datatype=XSD + "dateTime")} + ) + assert result == 10 + + def test_minutes(self): + from rdflib.term import Variable + expr = self._make_builtin("MINUTES", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15T10:30:45", datatype=XSD + "dateTime")} + ) + assert result == 30 + + def test_seconds(self): + from rdflib.term import Variable + expr = self._make_builtin("SECONDS", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15T10:30:45", datatype=XSD + "dateTime")} + ) + assert result == 45 + + def test_year_from_datetime(self): + from rdflib.term import Variable + expr = self._make_builtin("YEAR", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15T10:30:45", datatype=XSD + "dateTime")} + ) + assert result == 2024 + + def test_hours_from_date_returns_zero(self): + from rdflib.term import Variable + expr = self._make_builtin("HOURS", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15", datatype=XSD + "date")} + ) + assert result == 0 + + def test_year_invalid_date(self): + from rdflib.term import Variable + expr = self._make_builtin("YEAR", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("not-a-date")} + ) + assert result is None + + def test_floor(self): + from rdflib.term import Variable + expr = self._make_builtin("FLOOR", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("3.7")}) == 3 + + def test_floor_negative(self): + from rdflib.term import Variable + expr = self._make_builtin("FLOOR", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("-2.3")}) == -3 + + def test_floor_none(self): + from rdflib.term import Variable + expr = self._make_builtin("FLOOR", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("abc")}) is None + + def test_ceil(self): + from rdflib.term import Variable + expr = self._make_builtin("CEIL", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("3.2")}) == 4 + + def test_ceil_negative(self): + from rdflib.term import Variable + expr = self._make_builtin("CEIL", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("-2.7")}) == -2 + + def test_abs_positive(self): + from rdflib.term import Variable + expr = self._make_builtin("ABS", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("42")}) == 42 + + def test_abs_negative(self): + from rdflib.term import Variable + expr = self._make_builtin("ABS", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("-42")}) == 42 + + def test_abs_none(self): + from rdflib.term import Variable + expr = self._make_builtin("ABS", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("abc")}) is None + + def test_replace_simple(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("REPLACE", + arg=Variable("x"), + pattern=Literal(" BC"), + replacement=Literal(""), + flags=None) + result = evaluate_expression(expr, {"x": lit("500 BC")}) + assert result.type == LITERAL + assert result.value == "500" + + def test_replace_regex(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("REPLACE", + arg=Variable("x"), + pattern=Literal("[0-9]+"), + replacement=Literal("X"), + flags=None) + result = evaluate_expression(expr, {"x": lit("abc123def456")}) + assert result.value == "abcXdefX" + + def test_replace_case_insensitive(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("REPLACE", + arg=Variable("x"), + pattern=Literal("hello"), + replacement=Literal("world"), + flags=Literal("i")) + result = evaluate_expression(expr, {"x": lit("HELLO there")}) + assert result.value == "world there" + + def test_round_up(self): + from rdflib.term import Variable + expr = self._make_builtin("ROUND", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("3.7")}) == 4 + + def test_round_down(self): + from rdflib.term import Variable + expr = self._make_builtin("ROUND", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("3.2")}) == 3 + + def test_round_none(self): + from rdflib.term import Variable + expr = self._make_builtin("ROUND", arg=Variable("x")) + assert evaluate_expression(expr, {"x": lit("abc")}) is None + + def test_strbefore(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("STRBEFORE", + arg1=Variable("x"), arg2=Literal("-")) + result = evaluate_expression(expr, {"x": lit("2024-03-15")}) + assert result.value == "2024" + + def test_strbefore_not_found(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("STRBEFORE", + arg1=Variable("x"), arg2=Literal("/")) + result = evaluate_expression(expr, {"x": lit("hello")}) + assert result.value == "" + + def test_strafter(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("STRAFTER", + arg1=Variable("x"), arg2=Literal("-")) + result = evaluate_expression(expr, {"x": lit("2024-03-15")}) + assert result.value == "03-15" + + def test_strafter_not_found(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("STRAFTER", + arg1=Variable("x"), arg2=Literal("/")) + result = evaluate_expression(expr, {"x": lit("hello")}) + assert result.value == "" + + def test_encode_for_uri(self): + from rdflib.term import Variable + expr = self._make_builtin("ENCODE_FOR_URI", arg=Variable("x")) + result = evaluate_expression(expr, {"x": lit("hello world")}) + assert result.value == "hello%20world" + + def test_encode_for_uri_special_chars(self): + from rdflib.term import Variable + expr = self._make_builtin("ENCODE_FOR_URI", arg=Variable("x")) + result = evaluate_expression(expr, {"x": lit("a/b?c=d&e")}) + assert result.value == "a%2Fb%3Fc%3Dd%26e" + + def test_langmatches_basic(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("LANGMATCHES", + arg1=Literal("en"), arg2=Literal("en")) + assert evaluate_expression(expr, {}) is True + + def test_langmatches_subtag(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("LANGMATCHES", + arg1=Literal("en-US"), arg2=Literal("en")) + assert evaluate_expression(expr, {}) is True + + def test_langmatches_wildcard(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("LANGMATCHES", + arg1=Literal("fr"), arg2=Literal("*")) + assert evaluate_expression(expr, {}) is True + + def test_langmatches_wildcard_empty(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("LANGMATCHES", + arg1=Literal(""), arg2=Literal("*")) + assert evaluate_expression(expr, {}) is False + + def test_langmatches_no_match(self): + from rdflib.term import Variable + from rdflib import Literal + expr = self._make_builtin("LANGMATCHES", + arg1=Literal("fr"), arg2=Literal("en")) + assert evaluate_expression(expr, {}) is False + + def test_iri_constructor(self): + from rdflib.term import Variable + expr = self._make_builtin("IRI", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("http://example.com/test")} + ) + assert result.type == IRI + assert result.iri == "http://example.com/test" + + def test_uri_constructor(self): + from rdflib.term import Variable + expr = self._make_builtin("URI", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("http://example.com/test")} + ) + assert result.type == IRI + assert result.iri == "http://example.com/test" + + def test_bnode_no_arg(self): + expr = self._make_builtin("BNODE") + result = evaluate_expression(expr, {}) + assert result.type == BLANK + assert len(result.id) > 0 + + def test_bnode_with_label(self): + from rdflib import Literal + expr = self._make_builtin("BNODE", arg=Literal("mynode")) + result = evaluate_expression(expr, {}) + assert result.type == BLANK + assert result.id == "mynode" + + def test_now(self): + import re as re_mod + expr = self._make_builtin("NOW") + result = evaluate_expression(expr, {}) + assert result.type == LITERAL + assert result.datatype == XSD + "dateTime" + assert re_mod.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", result.value) + + def test_tz_with_utc(self): + from rdflib.term import Variable + expr = self._make_builtin("TZ", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15T10:30:45+0000", + datatype=XSD + "dateTime")} + ) + assert result.type == LITERAL + assert result.value == "+00:00" + + def test_tz_no_timezone(self): + from rdflib.term import Variable + expr = self._make_builtin("TZ", arg=Variable("x")) + result = evaluate_expression( + expr, {"x": lit("2024-03-15T10:30:45", + datatype=XSD + "dateTime")} + ) + assert result.value == "" + + def test_rand(self): + expr = self._make_builtin("RAND") + result = evaluate_expression(expr, {}) + assert isinstance(result, float) + assert 0.0 <= result < 1.0 + + def test_uuid(self): + import re as re_mod + expr = self._make_builtin("UUID") + result = evaluate_expression(expr, {}) + assert result.type == IRI + assert result.iri.startswith("urn:uuid:") + uuid_part = result.iri[len("urn:uuid:"):] + assert re_mod.match( + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", + uuid_part + ) + + def test_struuid(self): + import re as re_mod + expr = self._make_builtin("STRUUID") + result = evaluate_expression(expr, {}) + assert result.type == LITERAL + assert re_mod.match( + r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", + result.value + ) + + def test_md5(self): + from rdflib.term import Variable + expr = self._make_builtin("MD5", arg=Variable("x")) + result = evaluate_expression(expr, {"x": lit("hello")}) + assert result.type == LITERAL + assert result.value == "5d41402abc4b2a76b9719d911017c592" + + def test_sha1(self): + from rdflib.term import Variable + expr = self._make_builtin("SHA1", arg=Variable("x")) + result = evaluate_expression(expr, {"x": lit("hello")}) + assert result.type == LITERAL + assert result.value == "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d" + + def test_sha256(self): + from rdflib.term import Variable + expr = self._make_builtin("SHA256", arg=Variable("x")) + result = evaluate_expression(expr, {"x": lit("hello")}) + assert result.type == LITERAL + assert result.value == ( + "2cf24dba5fb0a30e26e83b2ac5b9e29e" + "1b161e5c1fa7425e73043362938b9824" + ) + + def test_sha512(self): + from rdflib.term import Variable + expr = self._make_builtin("SHA512", arg=Variable("x")) + result = evaluate_expression(expr, {"x": lit("hello")}) + assert result.type == LITERAL + assert len(result.value) == 128 + + def test_exists_with_callback(self): + from rdflib.plugins.sparql.parserutils import CompValue + graph = CompValue("BGP") + expr = self._make_builtin("EXISTS", graph=graph) + cb = lambda g, s: True + result = evaluate_expression(expr, {}, exists_cb=cb) + assert result is True + + def test_exists_callback_false(self): + from rdflib.plugins.sparql.parserutils import CompValue + graph = CompValue("BGP") + expr = self._make_builtin("EXISTS", graph=graph) + cb = lambda g, s: False + result = evaluate_expression(expr, {}, exists_cb=cb) + assert result is False + + def test_notexists_with_callback(self): + from rdflib.plugins.sparql.parserutils import CompValue + graph = CompValue("BGP") + expr = self._make_builtin("NOTEXISTS", graph=graph) + cb = lambda g, s: True + result = evaluate_expression(expr, {}, exists_cb=cb) + assert result is False + + def test_notexists_callback_false(self): + from rdflib.plugins.sparql.parserutils import CompValue + graph = CompValue("BGP") + expr = self._make_builtin("NOTEXISTS", graph=graph) + cb = lambda g, s: False + result = evaluate_expression(expr, {}, exists_cb=cb) + assert result is True + class TestEffectiveBoolean: diff --git a/tests/unit/test_query/test_sparql_solutions.py b/tests/unit/test_query/test_sparql_solutions.py index 5805ca84..7588a95b 100644 --- a/tests/unit/test_query/test_sparql_solutions.py +++ b/tests/unit/test_query/test_sparql_solutions.py @@ -5,7 +5,7 @@ Tests for SPARQL solution sequence operations. import pytest from trustgraph.schema import Term, IRI, LITERAL from trustgraph.query.sparql.solutions import ( - hash_join, left_join, union, project, distinct, + hash_join, left_join, minus, union, project, distinct, order_by, slice_solutions, _terms_equal, _compatible, ) @@ -311,6 +311,30 @@ class TestOrderBy: result = order_by(solutions, []) assert len(result) == 1 + def test_order_by_numeric_literals(self): + solutions = [ + {"year": lit("1950")}, + {"year": lit("700")}, + {"year": lit("2000")}, + {"year": lit("450")}, + {"year": lit("1200")}, + ] + key_fns = [(lambda sol: sol.get("year"), True)] + result = order_by(solutions, key_fns) + values = [s["year"].value for s in result] + assert values == ["450", "700", "1200", "1950", "2000"] + + def test_order_by_numeric_descending(self): + solutions = [ + {"year": lit("1950")}, + {"year": lit("700")}, + {"year": lit("2000")}, + ] + key_fns = [(lambda sol: sol.get("year"), False)] + result = order_by(solutions, key_fns) + values = [s["year"].value for s in result] + assert values == ["2000", "1950", "700"] + class TestSlice: @@ -343,3 +367,37 @@ class TestSlice: solutions = [{"s": alice}, {"s": bob}] result = slice_solutions(solutions) assert len(result) == 2 + + +class TestMinus: + + def test_removes_compatible(self, alice, bob): + left = [{"s": alice}, {"s": bob}] + right = [{"s": alice}] + result = minus(left, right) + assert len(result) == 1 + assert result[0]["s"].iri == "http://example.com/bob" + + def test_empty_right_preserves_all(self, alice, bob): + left = [{"s": alice}, {"s": bob}] + result = minus(left, []) + assert len(result) == 2 + + def test_no_shared_variables_preserves_all(self, alice, bob): + left = [{"s": alice}] + right = [{"t": bob}] + result = minus(left, right) + assert len(result) == 1 + + def test_all_removed(self, alice): + left = [{"s": alice}] + right = [{"s": alice}] + result = minus(left, right) + assert len(result) == 0 + + def test_partial_shared_variables(self, alice, bob): + left = [{"s": alice, "p": lit("x")}, {"s": bob, "p": lit("y")}] + right = [{"s": alice}] + result = minus(left, right) + assert len(result) == 1 + assert result[0]["s"].iri == "http://example.com/bob" diff --git a/trustgraph-flow/trustgraph/query/sparql/algebra.py b/trustgraph-flow/trustgraph/query/sparql/algebra.py index 76b1ad8e..6f0227c8 100644 --- a/trustgraph-flow/trustgraph/query/sparql/algebra.py +++ b/trustgraph-flow/trustgraph/query/sparql/algebra.py @@ -17,7 +17,7 @@ from ... knowledge import Uri from ... knowledge import Literal as KgLiteral from . parser import rdflib_term_to_term from . solutions import ( - hash_join, left_join, union, project, distinct, + hash_join, left_join, minus, union, project, distinct, order_by, slice_solutions, _term_key, ) from . expressions import evaluate_expression, _effective_boolean @@ -159,14 +159,69 @@ async def _eval_union(node, tc, collection, limit): return union(left, right)[:limit] +async def _eval_minus(node, tc, collection, limit): + """Evaluate a Minus node.""" + left = await evaluate(node.p1, tc, collection, limit) + right = await evaluate(node.p2, tc, collection, limit) + return minus(left, right) + + +async def _check_exists(graph_node, sol, tc, collection, limit): + """Evaluate an EXISTS graph pattern against a solution.""" + results = await evaluate(graph_node, tc, collection, limit) + for r in results: + shared = set(sol.keys()) & set(r.keys()) + if all( + _term_key(sol[v]) == _term_key(r[v]) + for v in shared + if sol.get(v) is not None and r.get(v) is not None + ): + return True + return False + + +async def _pre_eval_exists(expr, sol, tc, collection, limit, cache): + """Walk an expression tree, pre-evaluate EXISTS/NOT EXISTS, cache results.""" + if not isinstance(expr, CompValue): + return + if expr.name in ("Builtin_EXISTS", "Builtin_NOTEXISTS"): + key = id(expr.graph), id(sol) + if key not in cache: + cache[key] = await _check_exists( + expr.graph, sol, tc, collection, limit + ) + return + for attr in ("expr", "other", "arg", "arg1", "arg2", "arg3"): + child = getattr(expr, attr, None) + if child is None: + continue + if isinstance(child, CompValue): + await _pre_eval_exists(child, sol, tc, collection, limit, cache) + elif isinstance(child, (list, tuple)): + for item in child: + if isinstance(item, CompValue): + await _pre_eval_exists( + item, sol, tc, collection, limit, cache + ) + + async def _eval_filter(node, tc, collection, limit): """Evaluate a Filter node.""" solutions = await evaluate(node.p, tc, collection, limit) expr = node.expr - return [ - sol for sol in solutions - if _effective_boolean(evaluate_expression(expr, sol)) - ] + exists_cache = {} + + def exists_cb(graph_node, sol): + key = id(graph_node), id(sol) + return exists_cache.get(key, False) + + result = [] + for sol in solutions: + await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache) + if _effective_boolean(evaluate_expression(expr, sol, exists_cb=exists_cb)): + result.append(sol) + + return result async def _eval_distinct(node, tc, collection, limit): @@ -222,10 +277,16 @@ async def _eval_extend(node, tc, collection, limit): solutions = await evaluate(node.p, tc, collection, limit) var_name = str(node.var) expr = node.expr + exists_cache = {} + + def exists_cb(graph_node, sol): + key = id(graph_node), id(sol) + return exists_cache.get(key, False) result = [] for sol in solutions: - val = evaluate_expression(expr, sol) + await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache) + val = evaluate_expression(expr, sol, exists_cb=exists_cb) new_sol = dict(sol) if isinstance(val, Term): new_sol[var_name] = val @@ -525,6 +586,7 @@ _HANDLERS = { "Join": _eval_join, "LeftJoin": _eval_left_join, "Union": _eval_union, + "Minus": _eval_minus, "Filter": _eval_filter, "Distinct": _eval_distinct, "Reduced": _eval_reduced, diff --git a/trustgraph-flow/trustgraph/query/sparql/expressions.py b/trustgraph-flow/trustgraph/query/sparql/expressions.py index eac1199c..ad3202d9 100644 --- a/trustgraph-flow/trustgraph/query/sparql/expressions.py +++ b/trustgraph-flow/trustgraph/query/sparql/expressions.py @@ -5,9 +5,15 @@ Evaluates rdflib algebra expression nodes against a solution (variable binding) to produce a value or boolean result. """ +import hashlib +import math +import random import re import logging import operator +import uuid +from datetime import datetime, date, timezone +from urllib.parse import quote from rdflib.term import Variable, URIRef, Literal, BNode from rdflib.plugins.sparql.parserutils import CompValue @@ -17,23 +23,31 @@ from . parser import rdflib_term_to_term logger = logging.getLogger(__name__) +_exists_callback = None + class ExpressionError(Exception): """Raised when a SPARQL expression cannot be evaluated.""" pass -def evaluate_expression(expr, solution): +def evaluate_expression(expr, solution, exists_cb=None): """ Evaluate a SPARQL expression against a solution binding. Args: expr: rdflib algebra expression node solution: dict mapping variable names to Term values + exists_cb: optional callback(graph_node, solution) -> bool for + EXISTS/NOT EXISTS evaluation; provided by algebra.py Returns: The result value (Term, bool, number, string, or None) """ + global _exists_callback + if exists_cb is not None: + _exists_callback = exists_cb + if expr is None: return True @@ -119,15 +133,7 @@ def _evaluate_comp_value(node, solution): if name == "Function": return _eval_function(node, solution) - # Exists / NotExists - if name == "Builtin_EXISTS": - # EXISTS requires graph pattern evaluation - not handled here - logger.warning("EXISTS not supported in filter expressions") - return True - - if name == "Builtin_NOTEXISTS": - logger.warning("NOT EXISTS not supported in filter expressions") - return True + # Exists / NotExists — handled via _eval_builtin now # TrueFilter (used with OPTIONAL) if name == "TrueFilter": @@ -335,6 +341,197 @@ def _eval_builtin(name, node, solution): return val return None + if builtin == "YEAR": + dt = _to_datetime(evaluate_expression(node.arg, solution)) + return dt.year if dt is not None else None + + if builtin == "MONTH": + dt = _to_datetime(evaluate_expression(node.arg, solution)) + return dt.month if dt is not None else None + + if builtin == "DAY": + dt = _to_datetime(evaluate_expression(node.arg, solution)) + return dt.day if dt is not None else None + + if builtin == "HOURS": + dt = _to_datetime(evaluate_expression(node.arg, solution)) + if dt is None: + return None + return dt.hour if isinstance(dt, datetime) else 0 + + if builtin == "MINUTES": + dt = _to_datetime(evaluate_expression(node.arg, solution)) + if dt is None: + return None + return dt.minute if isinstance(dt, datetime) else 0 + + if builtin == "SECONDS": + dt = _to_datetime(evaluate_expression(node.arg, solution)) + if dt is None: + return None + return dt.second if isinstance(dt, datetime) else 0 + + if builtin == "FLOOR": + val = _to_numeric(evaluate_expression(node.arg, solution)) + if val is None: + return None + return int(math.floor(val)) + + if builtin == "CEIL": + val = _to_numeric(evaluate_expression(node.arg, solution)) + if val is None: + return None + return int(math.ceil(val)) + + if builtin == "ABS": + val = _to_numeric(evaluate_expression(node.arg, solution)) + if val is None: + return None + return abs(val) + + if builtin == "ROUND": + val = _to_numeric(evaluate_expression(node.arg, solution)) + if val is None: + return None + return round(val) + + if builtin == "STRBEFORE": + string = _to_string(evaluate_expression(node.arg1, solution)) + sep = _to_string(evaluate_expression(node.arg2, solution)) + idx = string.find(sep) + if idx < 0: + return Term(type=LITERAL, value="") + return Term(type=LITERAL, value=string[:idx]) + + if builtin == "STRAFTER": + string = _to_string(evaluate_expression(node.arg1, solution)) + sep = _to_string(evaluate_expression(node.arg2, solution)) + idx = string.find(sep) + if idx < 0: + return Term(type=LITERAL, value="") + return Term(type=LITERAL, value=string[idx + len(sep):]) + + if builtin == "ENCODE_FOR_URI": + val = _to_string(evaluate_expression(node.arg, solution)) + return Term(type=LITERAL, value=quote(val, safe="")) + + if builtin == "REPLACE": + string = _to_string(evaluate_expression(node.arg, solution)) + pattern = _to_string(evaluate_expression(node.pattern, solution)) + replacement = _to_string( + evaluate_expression(node.replacement, solution) + ) + flags_str = "" + if hasattr(node, "flags") and node.flags is not None: + flags_str = _to_string(evaluate_expression(node.flags, solution)) + re_flags = 0 + if "i" in flags_str: + re_flags |= re.IGNORECASE + if "m" in flags_str: + re_flags |= re.MULTILINE + if "s" in flags_str: + re_flags |= re.DOTALL + try: + result = re.sub(pattern, replacement, string, flags=re_flags) + return Term(type=LITERAL, value=result) + except re.error: + return None + + if builtin == "SUBSTR": + string = _to_string(evaluate_expression(node.arg, solution)) + start = _to_numeric(evaluate_expression(node.start, solution)) + if start is None: + return None + start_idx = max(int(start) - 1, 0) + if hasattr(node, "length") and node.length is not None: + length = _to_numeric(evaluate_expression(node.length, solution)) + if length is None: + return None + return Term( + type=LITERAL, value=string[start_idx:start_idx + int(length)] + ) + return Term(type=LITERAL, value=string[start_idx:]) + + if builtin == "EXISTS": + if _exists_callback is not None: + return _exists_callback(node.graph, solution) + logger.warning("EXISTS requires an exists_cb; not available") + return True + + if builtin == "NOTEXISTS": + if _exists_callback is not None: + return not _exists_callback(node.graph, solution) + logger.warning("NOT EXISTS requires an exists_cb; not available") + return True + + if builtin == "LANGMATCHES": + tag = _to_string(evaluate_expression(node.arg1, solution)) + rng = _to_string(evaluate_expression(node.arg2, solution)) + if rng == "*": + return len(tag) > 0 + return tag.lower().startswith(rng.lower()) + + if builtin == "IRI" or builtin == "URI": + val = _to_string(evaluate_expression(node.arg, solution)) + return Term(type=IRI, iri=val) + + if builtin == "BNODE": + if hasattr(node, "arg") and node.arg is not None: + label = _to_string(evaluate_expression(node.arg, solution)) + return Term(type=BLANK, id=label) + return Term(type=BLANK, id=str(uuid.uuid4())) + + if builtin == "NOW": + now = datetime.now(timezone.utc) + return Term( + type=LITERAL, + value=now.strftime("%Y-%m-%dT%H:%M:%S%z"), + datatype="http://www.w3.org/2001/XMLSchema#dateTime", + ) + + if builtin == "TZ": + dt = _to_datetime(evaluate_expression(node.arg, solution)) + if dt is None: + return Term(type=LITERAL, value="") + if dt.tzinfo is not None: + offset = dt.strftime("%z") + if offset: + return Term(type=LITERAL, value=offset[:3] + ":" + offset[3:]) + return Term(type=LITERAL, value="") + + if builtin == "RAND": + return random.random() + + if builtin == "UUID": + return Term(type=IRI, iri="urn:uuid:" + str(uuid.uuid4())) + + if builtin == "STRUUID": + return Term(type=LITERAL, value=str(uuid.uuid4())) + + if builtin == "MD5": + val = _to_string(evaluate_expression(node.arg, solution)) + return Term( + type=LITERAL, value=hashlib.md5(val.encode()).hexdigest() + ) + + if builtin == "SHA1": + val = _to_string(evaluate_expression(node.arg, solution)) + return Term( + type=LITERAL, value=hashlib.sha1(val.encode()).hexdigest() + ) + + if builtin == "SHA256": + val = _to_string(evaluate_expression(node.arg, solution)) + return Term( + type=LITERAL, value=hashlib.sha256(val.encode()).hexdigest() + ) + + if builtin == "SHA512": + val = _to_string(evaluate_expression(node.arg, solution)) + return Term( + type=LITERAL, value=hashlib.sha512(val.encode()).hexdigest() + ) + if builtin == "sameTerm": left = evaluate_expression(node.arg1, solution) right = evaluate_expression(node.arg2, solution) @@ -454,6 +651,27 @@ def _to_numeric(val): return None +def _to_datetime(val): + """Convert a value to a date or datetime object.""" + if val is None: + return None + s = _to_string(val) + for fmt in ( + "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z", + "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M", + "%Y-%m-%d", + ): + try: + return datetime.strptime(s, fmt) + except ValueError: + continue + try: + return datetime.fromisoformat(s) + except ValueError: + pass + return None + + def _comparable_value(val): """ Convert a value to a form suitable for comparison. diff --git a/trustgraph-flow/trustgraph/query/sparql/solutions.py b/trustgraph-flow/trustgraph/query/sparql/solutions.py index d1ea8373..edf3401d 100644 --- a/trustgraph-flow/trustgraph/query/sparql/solutions.py +++ b/trustgraph-flow/trustgraph/query/sparql/solutions.py @@ -150,6 +150,30 @@ def left_join(left, right, filter_fn=None): return results +def minus(left, right): + """ + MINUS operation: remove left solutions that are compatible with any + right solution sharing at least one variable. + """ + if not right: + return list(left) + + right_vars = set() + for sol in right: + right_vars.update(sol.keys()) + + results = [] + for sol_l in left: + shared = set(sol_l.keys()) & right_vars + if not shared: + results.append(sol_l) + continue + if not any(_compatible(sol_l, sol_r) for sol_r in right): + results.append(sol_l) + + return results + + def union(left, right): """Union two solution sequences (concatenation).""" return list(left) + list(right) @@ -177,6 +201,28 @@ def distinct(solutions): return results +def _sort_comparable(val): + """Convert a value to a form suitable for sort ordering.""" + if val is None: + return (0, "") + if isinstance(val, (int, float)): + return (2, val) + if isinstance(val, Term): + if val.type == LITERAL: + try: + if "." in val.value: + return (2, float(val.value)) + return (2, int(val.value)) + except (ValueError, TypeError): + pass + return (3, val.value) + elif val.type == IRI: + return (4, val.iri) + elif val.type == BLANK: + return (5, val.id) + return (6, str(val)) + + def order_by(solutions, key_fns): """ Sort solutions by the given key functions. @@ -191,14 +237,7 @@ def order_by(solutions, key_fns): keys = [] for fn, ascending in key_fns: val = fn(sol) - # Convert to comparable form - if val is None: - comparable = ("", "") - elif isinstance(val, Term): - comparable = _term_key(val) - else: - comparable = ("v", str(val)) - keys.append(comparable) + keys.append(_sort_comparable(val)) return keys # Handle ascending/descending @@ -224,10 +263,8 @@ def _mixed_sort(solutions, key_fns): def compare(a, b): for fn, ascending in key_fns: - va = fn(a) - vb = fn(b) - ka = _term_key(va) if isinstance(va, Term) else ("v", str(va)) if va is not None else ("", "") - kb = _term_key(vb) if isinstance(vb, Term) else ("v", str(vb)) if vb is not None else ("", "") + ka = _sort_comparable(fn(a)) + kb = _sort_comparable(fn(b)) if ka < kb: return -1 if ascending else 1