feat: extend SPARQL evaluator with comprehensive function and operator support (#945)

Add 30+ SPARQL 1.1 built-in functions and the MINUS algebra operator to the
custom SPARQL query backend.

String functions:
- SUBSTR (2-arg and 3-arg forms), STRBEFORE, STRAFTER
- REPLACE (regex with flags), ENCODE_FOR_URI

Numeric functions:
- FLOOR, CEIL, ROUND, ABS

Date/time accessors:
- YEAR, MONTH, DAY, HOURS, MINUTES, SECONDS
- NOW, TZ

Hash functions:
- MD5, SHA1, SHA256, SHA512

Term constructors:
- IRI/URI, BNODE, UUID, STRUUID

Other functions:
- LANGMATCHES, RAND
- EXISTS / NOT EXISTS (with async pre-evaluation to bridge the
  sync expression evaluator and async algebra evaluator)

Algebra:
- MINUS set-difference operator
- HAVING already works via rdflib's Filter mapping (verified)

Fix SPARQL ORDER handling

Includes 653 lines of new unit tests covering all added functionality
across expressions, solutions, and algebra layers.
This commit is contained in:
cybermaggedon 2026-05-21 10:50:11 +01:00 committed by GitHub
parent e57f4669e1
commit 2c3a699af3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 1021 additions and 29 deletions

View file

@ -84,6 +84,20 @@ def make_distinct(inner):
return node return node
def make_filter(inner, expr):
node = CompValue("Filter")
node.p = inner
node.expr = expr
return node
def make_minus(left, right):
node = CompValue("Minus")
node.p1 = left
node.p2 = right
return node
class TestQueryPattern: class TestQueryPattern:
"""Tests for _query_pattern — the leaf that calls TriplesClient.""" """Tests for _query_pattern — the leaf that calls TriplesClient."""
@ -282,6 +296,177 @@ class TestEvaluate:
assert len(solutions) == 1 assert len(solutions) == 1
@pytest.mark.asyncio
async def test_minus_removes_matching(self):
tc = AsyncMock()
alice = iri("http://example.com/alice")
bob = iri("http://example.com/bob")
knows = iri("http://example.com/knows")
hates = iri("http://example.com/hates")
charlie = iri("http://example.com/charlie")
left_triple = make_triple(alice, knows, bob)
right_triple1 = make_triple(alice, knows, bob)
right_triple2 = make_triple(alice, hates, charlie)
left_bgp = make_bgp(
(Variable("s"), URIRef("http://example.com/knows"), Variable("o"))
)
right_bgp = make_bgp(
(Variable("s"), URIRef("http://example.com/hates"), Variable("r"))
)
async def mock_query(**kwargs):
pred = kwargs.get("p")
if pred and pred.iri == "http://example.com/knows":
return [left_triple]
elif pred and pred.iri == "http://example.com/hates":
return [right_triple2]
return []
tc.query.side_effect = mock_query
tree = make_select(
make_project(
make_minus(left_bgp, right_bgp),
["s", "o"]
)
)
solutions = await evaluate(tree, tc, collection="default")
# alice knows bob, but alice also hates charlie
# shared var is "s" (alice), so alice's solution is removed
assert len(solutions) == 0
@pytest.mark.asyncio
async def test_minus_no_shared_vars_preserves_all(self):
tc = AsyncMock()
alice = iri("http://example.com/alice")
bob = iri("http://example.com/bob")
left_triple = make_triple(alice, iri("http://example.com/p"), bob)
left_bgp = make_bgp(
(Variable("s"), URIRef("http://example.com/p"), Variable("o"))
)
right_bgp = make_bgp(
(Variable("x"), URIRef("http://example.com/q"), Variable("y"))
)
async def mock_query(**kwargs):
pred = kwargs.get("p")
if pred and pred.iri == "http://example.com/p":
return [left_triple]
return []
tc.query.side_effect = mock_query
tree = make_select(
make_project(
make_minus(left_bgp, right_bgp),
["s", "o"]
)
)
solutions = await evaluate(tree, tc, collection="default")
assert len(solutions) == 1
@pytest.mark.asyncio
async def test_filter_exists_keeps_matching(self):
tc = AsyncMock()
alice = iri("http://example.com/alice")
bob = iri("http://example.com/bob")
charlie = iri("http://example.com/charlie")
left_triple1 = make_triple(alice, iri("http://example.com/knows"), bob)
left_triple2 = make_triple(alice, iri("http://example.com/knows"), charlie)
exists_triple = make_triple(bob, iri("http://example.com/likes"), alice)
left_bgp = make_bgp(
(Variable("s"), URIRef("http://example.com/knows"), Variable("o"))
)
exists_bgp = make_bgp(
(Variable("o"), URIRef("http://example.com/likes"), Variable("_any"))
)
async def mock_query(**kwargs):
pred = kwargs.get("p")
if pred and pred.iri == "http://example.com/knows":
return [left_triple1, left_triple2]
elif pred and pred.iri == "http://example.com/likes":
return [exists_triple]
return []
tc.query.side_effect = mock_query
exists_expr = CompValue("Builtin_EXISTS")
exists_expr.graph = exists_bgp
tree = make_select(
make_project(
make_filter(left_bgp, exists_expr),
["s", "o"]
)
)
solutions = await evaluate(tree, tc, collection="default")
# Only bob has a "likes" triple, so only the bob solution passes
result_objects = [s["o"].iri for s in solutions]
assert "http://example.com/bob" in result_objects
assert "http://example.com/charlie" not in result_objects
@pytest.mark.asyncio
async def test_filter_not_exists_removes_matching(self):
tc = AsyncMock()
alice = iri("http://example.com/alice")
bob = iri("http://example.com/bob")
charlie = iri("http://example.com/charlie")
left_triple1 = make_triple(alice, iri("http://example.com/knows"), bob)
left_triple2 = make_triple(alice, iri("http://example.com/knows"), charlie)
exists_triple = make_triple(bob, iri("http://example.com/likes"), alice)
left_bgp = make_bgp(
(Variable("s"), URIRef("http://example.com/knows"), Variable("o"))
)
exists_bgp = make_bgp(
(Variable("o"), URIRef("http://example.com/likes"), Variable("_any"))
)
async def mock_query(**kwargs):
pred = kwargs.get("p")
if pred and pred.iri == "http://example.com/knows":
return [left_triple1, left_triple2]
elif pred and pred.iri == "http://example.com/likes":
return [exists_triple]
return []
tc.query.side_effect = mock_query
not_exists_expr = CompValue("Builtin_NOTEXISTS")
not_exists_expr.graph = exists_bgp
tree = make_select(
make_project(
make_filter(left_bgp, not_exists_expr),
["s", "o"]
)
)
solutions = await evaluate(tree, tc, collection="default")
# bob has a "likes" triple so is removed; charlie stays
result_objects = [s["o"].iri for s in solutions]
assert "http://example.com/charlie" in result_objects
assert "http://example.com/bob" not in result_objects
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_unsupported_node_returns_empty_solution(self): async def test_unsupported_node_returns_empty_solution(self):
tc = AsyncMock() tc = AsyncMock()

View file

@ -300,6 +300,438 @@ class TestBuiltinFunctions:
flags=None) flags=None)
assert evaluate_expression(expr, {"x": lit("hello")}) is False assert evaluate_expression(expr, {"x": lit("hello")}) is False
def test_substr_three_args(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("SUBSTR",
arg=Variable("x"),
start=Literal(1),
length=Literal(4))
result = evaluate_expression(expr, {"x": lit("2024-03-15")})
assert result.type == LITERAL
assert result.value == "2024"
def test_substr_two_args(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("SUBSTR",
arg=Variable("x"),
start=Literal(6),
length=None)
result = evaluate_expression(expr, {"x": lit("2024-03-15")})
assert result.type == LITERAL
assert result.value == "03-15"
def test_substr_middle(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("SUBSTR",
arg=Variable("x"),
start=Literal(6),
length=Literal(2))
result = evaluate_expression(expr, {"x": lit("2024-03-15")})
assert result.type == LITERAL
assert result.value == "03"
def test_substr_null_start(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("SUBSTR",
arg=Variable("x"),
start=Variable("missing"),
length=None)
result = evaluate_expression(expr, {"x": lit("hello")})
assert result is None
def test_year(self):
from rdflib.term import Variable
expr = self._make_builtin("YEAR", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15", datatype=XSD + "date")}
)
assert result == 2024
def test_month(self):
from rdflib.term import Variable
expr = self._make_builtin("MONTH", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15", datatype=XSD + "date")}
)
assert result == 3
def test_day(self):
from rdflib.term import Variable
expr = self._make_builtin("DAY", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15", datatype=XSD + "date")}
)
assert result == 15
def test_hours(self):
from rdflib.term import Variable
expr = self._make_builtin("HOURS", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15T10:30:45", datatype=XSD + "dateTime")}
)
assert result == 10
def test_minutes(self):
from rdflib.term import Variable
expr = self._make_builtin("MINUTES", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15T10:30:45", datatype=XSD + "dateTime")}
)
assert result == 30
def test_seconds(self):
from rdflib.term import Variable
expr = self._make_builtin("SECONDS", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15T10:30:45", datatype=XSD + "dateTime")}
)
assert result == 45
def test_year_from_datetime(self):
from rdflib.term import Variable
expr = self._make_builtin("YEAR", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15T10:30:45", datatype=XSD + "dateTime")}
)
assert result == 2024
def test_hours_from_date_returns_zero(self):
from rdflib.term import Variable
expr = self._make_builtin("HOURS", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15", datatype=XSD + "date")}
)
assert result == 0
def test_year_invalid_date(self):
from rdflib.term import Variable
expr = self._make_builtin("YEAR", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("not-a-date")}
)
assert result is None
def test_floor(self):
from rdflib.term import Variable
expr = self._make_builtin("FLOOR", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("3.7")}) == 3
def test_floor_negative(self):
from rdflib.term import Variable
expr = self._make_builtin("FLOOR", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("-2.3")}) == -3
def test_floor_none(self):
from rdflib.term import Variable
expr = self._make_builtin("FLOOR", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("abc")}) is None
def test_ceil(self):
from rdflib.term import Variable
expr = self._make_builtin("CEIL", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("3.2")}) == 4
def test_ceil_negative(self):
from rdflib.term import Variable
expr = self._make_builtin("CEIL", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("-2.7")}) == -2
def test_abs_positive(self):
from rdflib.term import Variable
expr = self._make_builtin("ABS", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("42")}) == 42
def test_abs_negative(self):
from rdflib.term import Variable
expr = self._make_builtin("ABS", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("-42")}) == 42
def test_abs_none(self):
from rdflib.term import Variable
expr = self._make_builtin("ABS", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("abc")}) is None
def test_replace_simple(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("REPLACE",
arg=Variable("x"),
pattern=Literal(" BC"),
replacement=Literal(""),
flags=None)
result = evaluate_expression(expr, {"x": lit("500 BC")})
assert result.type == LITERAL
assert result.value == "500"
def test_replace_regex(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("REPLACE",
arg=Variable("x"),
pattern=Literal("[0-9]+"),
replacement=Literal("X"),
flags=None)
result = evaluate_expression(expr, {"x": lit("abc123def456")})
assert result.value == "abcXdefX"
def test_replace_case_insensitive(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("REPLACE",
arg=Variable("x"),
pattern=Literal("hello"),
replacement=Literal("world"),
flags=Literal("i"))
result = evaluate_expression(expr, {"x": lit("HELLO there")})
assert result.value == "world there"
def test_round_up(self):
from rdflib.term import Variable
expr = self._make_builtin("ROUND", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("3.7")}) == 4
def test_round_down(self):
from rdflib.term import Variable
expr = self._make_builtin("ROUND", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("3.2")}) == 3
def test_round_none(self):
from rdflib.term import Variable
expr = self._make_builtin("ROUND", arg=Variable("x"))
assert evaluate_expression(expr, {"x": lit("abc")}) is None
def test_strbefore(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("STRBEFORE",
arg1=Variable("x"), arg2=Literal("-"))
result = evaluate_expression(expr, {"x": lit("2024-03-15")})
assert result.value == "2024"
def test_strbefore_not_found(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("STRBEFORE",
arg1=Variable("x"), arg2=Literal("/"))
result = evaluate_expression(expr, {"x": lit("hello")})
assert result.value == ""
def test_strafter(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("STRAFTER",
arg1=Variable("x"), arg2=Literal("-"))
result = evaluate_expression(expr, {"x": lit("2024-03-15")})
assert result.value == "03-15"
def test_strafter_not_found(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("STRAFTER",
arg1=Variable("x"), arg2=Literal("/"))
result = evaluate_expression(expr, {"x": lit("hello")})
assert result.value == ""
def test_encode_for_uri(self):
from rdflib.term import Variable
expr = self._make_builtin("ENCODE_FOR_URI", arg=Variable("x"))
result = evaluate_expression(expr, {"x": lit("hello world")})
assert result.value == "hello%20world"
def test_encode_for_uri_special_chars(self):
from rdflib.term import Variable
expr = self._make_builtin("ENCODE_FOR_URI", arg=Variable("x"))
result = evaluate_expression(expr, {"x": lit("a/b?c=d&e")})
assert result.value == "a%2Fb%3Fc%3Dd%26e"
def test_langmatches_basic(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("LANGMATCHES",
arg1=Literal("en"), arg2=Literal("en"))
assert evaluate_expression(expr, {}) is True
def test_langmatches_subtag(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("LANGMATCHES",
arg1=Literal("en-US"), arg2=Literal("en"))
assert evaluate_expression(expr, {}) is True
def test_langmatches_wildcard(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("LANGMATCHES",
arg1=Literal("fr"), arg2=Literal("*"))
assert evaluate_expression(expr, {}) is True
def test_langmatches_wildcard_empty(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("LANGMATCHES",
arg1=Literal(""), arg2=Literal("*"))
assert evaluate_expression(expr, {}) is False
def test_langmatches_no_match(self):
from rdflib.term import Variable
from rdflib import Literal
expr = self._make_builtin("LANGMATCHES",
arg1=Literal("fr"), arg2=Literal("en"))
assert evaluate_expression(expr, {}) is False
def test_iri_constructor(self):
from rdflib.term import Variable
expr = self._make_builtin("IRI", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("http://example.com/test")}
)
assert result.type == IRI
assert result.iri == "http://example.com/test"
def test_uri_constructor(self):
from rdflib.term import Variable
expr = self._make_builtin("URI", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("http://example.com/test")}
)
assert result.type == IRI
assert result.iri == "http://example.com/test"
def test_bnode_no_arg(self):
expr = self._make_builtin("BNODE")
result = evaluate_expression(expr, {})
assert result.type == BLANK
assert len(result.id) > 0
def test_bnode_with_label(self):
from rdflib import Literal
expr = self._make_builtin("BNODE", arg=Literal("mynode"))
result = evaluate_expression(expr, {})
assert result.type == BLANK
assert result.id == "mynode"
def test_now(self):
import re as re_mod
expr = self._make_builtin("NOW")
result = evaluate_expression(expr, {})
assert result.type == LITERAL
assert result.datatype == XSD + "dateTime"
assert re_mod.match(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", result.value)
def test_tz_with_utc(self):
from rdflib.term import Variable
expr = self._make_builtin("TZ", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15T10:30:45+0000",
datatype=XSD + "dateTime")}
)
assert result.type == LITERAL
assert result.value == "+00:00"
def test_tz_no_timezone(self):
from rdflib.term import Variable
expr = self._make_builtin("TZ", arg=Variable("x"))
result = evaluate_expression(
expr, {"x": lit("2024-03-15T10:30:45",
datatype=XSD + "dateTime")}
)
assert result.value == ""
def test_rand(self):
expr = self._make_builtin("RAND")
result = evaluate_expression(expr, {})
assert isinstance(result, float)
assert 0.0 <= result < 1.0
def test_uuid(self):
import re as re_mod
expr = self._make_builtin("UUID")
result = evaluate_expression(expr, {})
assert result.type == IRI
assert result.iri.startswith("urn:uuid:")
uuid_part = result.iri[len("urn:uuid:"):]
assert re_mod.match(
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
uuid_part
)
def test_struuid(self):
import re as re_mod
expr = self._make_builtin("STRUUID")
result = evaluate_expression(expr, {})
assert result.type == LITERAL
assert re_mod.match(
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
result.value
)
def test_md5(self):
from rdflib.term import Variable
expr = self._make_builtin("MD5", arg=Variable("x"))
result = evaluate_expression(expr, {"x": lit("hello")})
assert result.type == LITERAL
assert result.value == "5d41402abc4b2a76b9719d911017c592"
def test_sha1(self):
from rdflib.term import Variable
expr = self._make_builtin("SHA1", arg=Variable("x"))
result = evaluate_expression(expr, {"x": lit("hello")})
assert result.type == LITERAL
assert result.value == "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d"
def test_sha256(self):
from rdflib.term import Variable
expr = self._make_builtin("SHA256", arg=Variable("x"))
result = evaluate_expression(expr, {"x": lit("hello")})
assert result.type == LITERAL
assert result.value == (
"2cf24dba5fb0a30e26e83b2ac5b9e29e"
"1b161e5c1fa7425e73043362938b9824"
)
def test_sha512(self):
from rdflib.term import Variable
expr = self._make_builtin("SHA512", arg=Variable("x"))
result = evaluate_expression(expr, {"x": lit("hello")})
assert result.type == LITERAL
assert len(result.value) == 128
def test_exists_with_callback(self):
from rdflib.plugins.sparql.parserutils import CompValue
graph = CompValue("BGP")
expr = self._make_builtin("EXISTS", graph=graph)
cb = lambda g, s: True
result = evaluate_expression(expr, {}, exists_cb=cb)
assert result is True
def test_exists_callback_false(self):
from rdflib.plugins.sparql.parserutils import CompValue
graph = CompValue("BGP")
expr = self._make_builtin("EXISTS", graph=graph)
cb = lambda g, s: False
result = evaluate_expression(expr, {}, exists_cb=cb)
assert result is False
def test_notexists_with_callback(self):
from rdflib.plugins.sparql.parserutils import CompValue
graph = CompValue("BGP")
expr = self._make_builtin("NOTEXISTS", graph=graph)
cb = lambda g, s: True
result = evaluate_expression(expr, {}, exists_cb=cb)
assert result is False
def test_notexists_callback_false(self):
from rdflib.plugins.sparql.parserutils import CompValue
graph = CompValue("BGP")
expr = self._make_builtin("NOTEXISTS", graph=graph)
cb = lambda g, s: False
result = evaluate_expression(expr, {}, exists_cb=cb)
assert result is True
class TestEffectiveBoolean: class TestEffectiveBoolean:

View file

@ -5,7 +5,7 @@ Tests for SPARQL solution sequence operations.
import pytest import pytest
from trustgraph.schema import Term, IRI, LITERAL from trustgraph.schema import Term, IRI, LITERAL
from trustgraph.query.sparql.solutions import ( from trustgraph.query.sparql.solutions import (
hash_join, left_join, union, project, distinct, hash_join, left_join, minus, union, project, distinct,
order_by, slice_solutions, _terms_equal, _compatible, order_by, slice_solutions, _terms_equal, _compatible,
) )
@ -311,6 +311,30 @@ class TestOrderBy:
result = order_by(solutions, []) result = order_by(solutions, [])
assert len(result) == 1 assert len(result) == 1
def test_order_by_numeric_literals(self):
solutions = [
{"year": lit("1950")},
{"year": lit("700")},
{"year": lit("2000")},
{"year": lit("450")},
{"year": lit("1200")},
]
key_fns = [(lambda sol: sol.get("year"), True)]
result = order_by(solutions, key_fns)
values = [s["year"].value for s in result]
assert values == ["450", "700", "1200", "1950", "2000"]
def test_order_by_numeric_descending(self):
solutions = [
{"year": lit("1950")},
{"year": lit("700")},
{"year": lit("2000")},
]
key_fns = [(lambda sol: sol.get("year"), False)]
result = order_by(solutions, key_fns)
values = [s["year"].value for s in result]
assert values == ["2000", "1950", "700"]
class TestSlice: class TestSlice:
@ -343,3 +367,37 @@ class TestSlice:
solutions = [{"s": alice}, {"s": bob}] solutions = [{"s": alice}, {"s": bob}]
result = slice_solutions(solutions) result = slice_solutions(solutions)
assert len(result) == 2 assert len(result) == 2
class TestMinus:
def test_removes_compatible(self, alice, bob):
left = [{"s": alice}, {"s": bob}]
right = [{"s": alice}]
result = minus(left, right)
assert len(result) == 1
assert result[0]["s"].iri == "http://example.com/bob"
def test_empty_right_preserves_all(self, alice, bob):
left = [{"s": alice}, {"s": bob}]
result = minus(left, [])
assert len(result) == 2
def test_no_shared_variables_preserves_all(self, alice, bob):
left = [{"s": alice}]
right = [{"t": bob}]
result = minus(left, right)
assert len(result) == 1
def test_all_removed(self, alice):
left = [{"s": alice}]
right = [{"s": alice}]
result = minus(left, right)
assert len(result) == 0
def test_partial_shared_variables(self, alice, bob):
left = [{"s": alice, "p": lit("x")}, {"s": bob, "p": lit("y")}]
right = [{"s": alice}]
result = minus(left, right)
assert len(result) == 1
assert result[0]["s"].iri == "http://example.com/bob"

View file

@ -17,7 +17,7 @@ from ... knowledge import Uri
from ... knowledge import Literal as KgLiteral from ... knowledge import Literal as KgLiteral
from . parser import rdflib_term_to_term from . parser import rdflib_term_to_term
from . solutions import ( from . solutions import (
hash_join, left_join, union, project, distinct, hash_join, left_join, minus, union, project, distinct,
order_by, slice_solutions, _term_key, order_by, slice_solutions, _term_key,
) )
from . expressions import evaluate_expression, _effective_boolean from . expressions import evaluate_expression, _effective_boolean
@ -159,14 +159,69 @@ async def _eval_union(node, tc, collection, limit):
return union(left, right)[:limit] return union(left, right)[:limit]
async def _eval_minus(node, tc, collection, limit):
"""Evaluate a Minus node."""
left = await evaluate(node.p1, tc, collection, limit)
right = await evaluate(node.p2, tc, collection, limit)
return minus(left, right)
async def _check_exists(graph_node, sol, tc, collection, limit):
"""Evaluate an EXISTS graph pattern against a solution."""
results = await evaluate(graph_node, tc, collection, limit)
for r in results:
shared = set(sol.keys()) & set(r.keys())
if all(
_term_key(sol[v]) == _term_key(r[v])
for v in shared
if sol.get(v) is not None and r.get(v) is not None
):
return True
return False
async def _pre_eval_exists(expr, sol, tc, collection, limit, cache):
"""Walk an expression tree, pre-evaluate EXISTS/NOT EXISTS, cache results."""
if not isinstance(expr, CompValue):
return
if expr.name in ("Builtin_EXISTS", "Builtin_NOTEXISTS"):
key = id(expr.graph), id(sol)
if key not in cache:
cache[key] = await _check_exists(
expr.graph, sol, tc, collection, limit
)
return
for attr in ("expr", "other", "arg", "arg1", "arg2", "arg3"):
child = getattr(expr, attr, None)
if child is None:
continue
if isinstance(child, CompValue):
await _pre_eval_exists(child, sol, tc, collection, limit, cache)
elif isinstance(child, (list, tuple)):
for item in child:
if isinstance(item, CompValue):
await _pre_eval_exists(
item, sol, tc, collection, limit, cache
)
async def _eval_filter(node, tc, collection, limit): async def _eval_filter(node, tc, collection, limit):
"""Evaluate a Filter node.""" """Evaluate a Filter node."""
solutions = await evaluate(node.p, tc, collection, limit) solutions = await evaluate(node.p, tc, collection, limit)
expr = node.expr expr = node.expr
return [ exists_cache = {}
sol for sol in solutions
if _effective_boolean(evaluate_expression(expr, sol)) def exists_cb(graph_node, sol):
] key = id(graph_node), id(sol)
return exists_cache.get(key, False)
result = []
for sol in solutions:
await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache)
if _effective_boolean(evaluate_expression(expr, sol, exists_cb=exists_cb)):
result.append(sol)
return result
async def _eval_distinct(node, tc, collection, limit): async def _eval_distinct(node, tc, collection, limit):
@ -222,10 +277,16 @@ async def _eval_extend(node, tc, collection, limit):
solutions = await evaluate(node.p, tc, collection, limit) solutions = await evaluate(node.p, tc, collection, limit)
var_name = str(node.var) var_name = str(node.var)
expr = node.expr expr = node.expr
exists_cache = {}
def exists_cb(graph_node, sol):
key = id(graph_node), id(sol)
return exists_cache.get(key, False)
result = [] result = []
for sol in solutions: for sol in solutions:
val = evaluate_expression(expr, sol) await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache)
val = evaluate_expression(expr, sol, exists_cb=exists_cb)
new_sol = dict(sol) new_sol = dict(sol)
if isinstance(val, Term): if isinstance(val, Term):
new_sol[var_name] = val new_sol[var_name] = val
@ -525,6 +586,7 @@ _HANDLERS = {
"Join": _eval_join, "Join": _eval_join,
"LeftJoin": _eval_left_join, "LeftJoin": _eval_left_join,
"Union": _eval_union, "Union": _eval_union,
"Minus": _eval_minus,
"Filter": _eval_filter, "Filter": _eval_filter,
"Distinct": _eval_distinct, "Distinct": _eval_distinct,
"Reduced": _eval_reduced, "Reduced": _eval_reduced,

View file

@ -5,9 +5,15 @@ Evaluates rdflib algebra expression nodes against a solution (variable
binding) to produce a value or boolean result. binding) to produce a value or boolean result.
""" """
import hashlib
import math
import random
import re import re
import logging import logging
import operator import operator
import uuid
from datetime import datetime, date, timezone
from urllib.parse import quote
from rdflib.term import Variable, URIRef, Literal, BNode from rdflib.term import Variable, URIRef, Literal, BNode
from rdflib.plugins.sparql.parserutils import CompValue from rdflib.plugins.sparql.parserutils import CompValue
@ -17,23 +23,31 @@ from . parser import rdflib_term_to_term
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_exists_callback = None
class ExpressionError(Exception): class ExpressionError(Exception):
"""Raised when a SPARQL expression cannot be evaluated.""" """Raised when a SPARQL expression cannot be evaluated."""
pass pass
def evaluate_expression(expr, solution): def evaluate_expression(expr, solution, exists_cb=None):
""" """
Evaluate a SPARQL expression against a solution binding. Evaluate a SPARQL expression against a solution binding.
Args: Args:
expr: rdflib algebra expression node expr: rdflib algebra expression node
solution: dict mapping variable names to Term values solution: dict mapping variable names to Term values
exists_cb: optional callback(graph_node, solution) -> bool for
EXISTS/NOT EXISTS evaluation; provided by algebra.py
Returns: Returns:
The result value (Term, bool, number, string, or None) The result value (Term, bool, number, string, or None)
""" """
global _exists_callback
if exists_cb is not None:
_exists_callback = exists_cb
if expr is None: if expr is None:
return True return True
@ -119,15 +133,7 @@ def _evaluate_comp_value(node, solution):
if name == "Function": if name == "Function":
return _eval_function(node, solution) return _eval_function(node, solution)
# Exists / NotExists # Exists / NotExists — handled via _eval_builtin now
if name == "Builtin_EXISTS":
# EXISTS requires graph pattern evaluation - not handled here
logger.warning("EXISTS not supported in filter expressions")
return True
if name == "Builtin_NOTEXISTS":
logger.warning("NOT EXISTS not supported in filter expressions")
return True
# TrueFilter (used with OPTIONAL) # TrueFilter (used with OPTIONAL)
if name == "TrueFilter": if name == "TrueFilter":
@ -335,6 +341,197 @@ def _eval_builtin(name, node, solution):
return val return val
return None return None
if builtin == "YEAR":
dt = _to_datetime(evaluate_expression(node.arg, solution))
return dt.year if dt is not None else None
if builtin == "MONTH":
dt = _to_datetime(evaluate_expression(node.arg, solution))
return dt.month if dt is not None else None
if builtin == "DAY":
dt = _to_datetime(evaluate_expression(node.arg, solution))
return dt.day if dt is not None else None
if builtin == "HOURS":
dt = _to_datetime(evaluate_expression(node.arg, solution))
if dt is None:
return None
return dt.hour if isinstance(dt, datetime) else 0
if builtin == "MINUTES":
dt = _to_datetime(evaluate_expression(node.arg, solution))
if dt is None:
return None
return dt.minute if isinstance(dt, datetime) else 0
if builtin == "SECONDS":
dt = _to_datetime(evaluate_expression(node.arg, solution))
if dt is None:
return None
return dt.second if isinstance(dt, datetime) else 0
if builtin == "FLOOR":
val = _to_numeric(evaluate_expression(node.arg, solution))
if val is None:
return None
return int(math.floor(val))
if builtin == "CEIL":
val = _to_numeric(evaluate_expression(node.arg, solution))
if val is None:
return None
return int(math.ceil(val))
if builtin == "ABS":
val = _to_numeric(evaluate_expression(node.arg, solution))
if val is None:
return None
return abs(val)
if builtin == "ROUND":
val = _to_numeric(evaluate_expression(node.arg, solution))
if val is None:
return None
return round(val)
if builtin == "STRBEFORE":
string = _to_string(evaluate_expression(node.arg1, solution))
sep = _to_string(evaluate_expression(node.arg2, solution))
idx = string.find(sep)
if idx < 0:
return Term(type=LITERAL, value="")
return Term(type=LITERAL, value=string[:idx])
if builtin == "STRAFTER":
string = _to_string(evaluate_expression(node.arg1, solution))
sep = _to_string(evaluate_expression(node.arg2, solution))
idx = string.find(sep)
if idx < 0:
return Term(type=LITERAL, value="")
return Term(type=LITERAL, value=string[idx + len(sep):])
if builtin == "ENCODE_FOR_URI":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(type=LITERAL, value=quote(val, safe=""))
if builtin == "REPLACE":
string = _to_string(evaluate_expression(node.arg, solution))
pattern = _to_string(evaluate_expression(node.pattern, solution))
replacement = _to_string(
evaluate_expression(node.replacement, solution)
)
flags_str = ""
if hasattr(node, "flags") and node.flags is not None:
flags_str = _to_string(evaluate_expression(node.flags, solution))
re_flags = 0
if "i" in flags_str:
re_flags |= re.IGNORECASE
if "m" in flags_str:
re_flags |= re.MULTILINE
if "s" in flags_str:
re_flags |= re.DOTALL
try:
result = re.sub(pattern, replacement, string, flags=re_flags)
return Term(type=LITERAL, value=result)
except re.error:
return None
if builtin == "SUBSTR":
string = _to_string(evaluate_expression(node.arg, solution))
start = _to_numeric(evaluate_expression(node.start, solution))
if start is None:
return None
start_idx = max(int(start) - 1, 0)
if hasattr(node, "length") and node.length is not None:
length = _to_numeric(evaluate_expression(node.length, solution))
if length is None:
return None
return Term(
type=LITERAL, value=string[start_idx:start_idx + int(length)]
)
return Term(type=LITERAL, value=string[start_idx:])
if builtin == "EXISTS":
if _exists_callback is not None:
return _exists_callback(node.graph, solution)
logger.warning("EXISTS requires an exists_cb; not available")
return True
if builtin == "NOTEXISTS":
if _exists_callback is not None:
return not _exists_callback(node.graph, solution)
logger.warning("NOT EXISTS requires an exists_cb; not available")
return True
if builtin == "LANGMATCHES":
tag = _to_string(evaluate_expression(node.arg1, solution))
rng = _to_string(evaluate_expression(node.arg2, solution))
if rng == "*":
return len(tag) > 0
return tag.lower().startswith(rng.lower())
if builtin == "IRI" or builtin == "URI":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(type=IRI, iri=val)
if builtin == "BNODE":
if hasattr(node, "arg") and node.arg is not None:
label = _to_string(evaluate_expression(node.arg, solution))
return Term(type=BLANK, id=label)
return Term(type=BLANK, id=str(uuid.uuid4()))
if builtin == "NOW":
now = datetime.now(timezone.utc)
return Term(
type=LITERAL,
value=now.strftime("%Y-%m-%dT%H:%M:%S%z"),
datatype="http://www.w3.org/2001/XMLSchema#dateTime",
)
if builtin == "TZ":
dt = _to_datetime(evaluate_expression(node.arg, solution))
if dt is None:
return Term(type=LITERAL, value="")
if dt.tzinfo is not None:
offset = dt.strftime("%z")
if offset:
return Term(type=LITERAL, value=offset[:3] + ":" + offset[3:])
return Term(type=LITERAL, value="")
if builtin == "RAND":
return random.random()
if builtin == "UUID":
return Term(type=IRI, iri="urn:uuid:" + str(uuid.uuid4()))
if builtin == "STRUUID":
return Term(type=LITERAL, value=str(uuid.uuid4()))
if builtin == "MD5":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(
type=LITERAL, value=hashlib.md5(val.encode()).hexdigest()
)
if builtin == "SHA1":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(
type=LITERAL, value=hashlib.sha1(val.encode()).hexdigest()
)
if builtin == "SHA256":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(
type=LITERAL, value=hashlib.sha256(val.encode()).hexdigest()
)
if builtin == "SHA512":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(
type=LITERAL, value=hashlib.sha512(val.encode()).hexdigest()
)
if builtin == "sameTerm": if builtin == "sameTerm":
left = evaluate_expression(node.arg1, solution) left = evaluate_expression(node.arg1, solution)
right = evaluate_expression(node.arg2, solution) right = evaluate_expression(node.arg2, solution)
@ -454,6 +651,27 @@ def _to_numeric(val):
return None return None
def _to_datetime(val):
"""Convert a value to a date or datetime object."""
if val is None:
return None
s = _to_string(val)
for fmt in (
"%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z",
"%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M",
"%Y-%m-%d",
):
try:
return datetime.strptime(s, fmt)
except ValueError:
continue
try:
return datetime.fromisoformat(s)
except ValueError:
pass
return None
def _comparable_value(val): def _comparable_value(val):
""" """
Convert a value to a form suitable for comparison. Convert a value to a form suitable for comparison.

View file

@ -150,6 +150,30 @@ def left_join(left, right, filter_fn=None):
return results return results
def minus(left, right):
"""
MINUS operation: remove left solutions that are compatible with any
right solution sharing at least one variable.
"""
if not right:
return list(left)
right_vars = set()
for sol in right:
right_vars.update(sol.keys())
results = []
for sol_l in left:
shared = set(sol_l.keys()) & right_vars
if not shared:
results.append(sol_l)
continue
if not any(_compatible(sol_l, sol_r) for sol_r in right):
results.append(sol_l)
return results
def union(left, right): def union(left, right):
"""Union two solution sequences (concatenation).""" """Union two solution sequences (concatenation)."""
return list(left) + list(right) return list(left) + list(right)
@ -177,6 +201,28 @@ def distinct(solutions):
return results return results
def _sort_comparable(val):
"""Convert a value to a form suitable for sort ordering."""
if val is None:
return (0, "")
if isinstance(val, (int, float)):
return (2, val)
if isinstance(val, Term):
if val.type == LITERAL:
try:
if "." in val.value:
return (2, float(val.value))
return (2, int(val.value))
except (ValueError, TypeError):
pass
return (3, val.value)
elif val.type == IRI:
return (4, val.iri)
elif val.type == BLANK:
return (5, val.id)
return (6, str(val))
def order_by(solutions, key_fns): def order_by(solutions, key_fns):
""" """
Sort solutions by the given key functions. Sort solutions by the given key functions.
@ -191,14 +237,7 @@ def order_by(solutions, key_fns):
keys = [] keys = []
for fn, ascending in key_fns: for fn, ascending in key_fns:
val = fn(sol) val = fn(sol)
# Convert to comparable form keys.append(_sort_comparable(val))
if val is None:
comparable = ("", "")
elif isinstance(val, Term):
comparable = _term_key(val)
else:
comparable = ("v", str(val))
keys.append(comparable)
return keys return keys
# Handle ascending/descending # Handle ascending/descending
@ -224,10 +263,8 @@ def _mixed_sort(solutions, key_fns):
def compare(a, b): def compare(a, b):
for fn, ascending in key_fns: for fn, ascending in key_fns:
va = fn(a) ka = _sort_comparable(fn(a))
vb = fn(b) kb = _sort_comparable(fn(b))
ka = _term_key(va) if isinstance(va, Term) else ("v", str(va)) if va is not None else ("", "")
kb = _term_key(vb) if isinstance(vb, Term) else ("v", str(vb)) if vb is not None else ("", "")
if ka < kb: if ka < kb:
return -1 if ascending else 1 return -1 if ascending else 1