feat: extend SPARQL evaluator with comprehensive function and operator support (#945)

Add 30+ SPARQL 1.1 built-in functions and the MINUS algebra operator to the
custom SPARQL query backend.

String functions:
- SUBSTR (2-arg and 3-arg forms), STRBEFORE, STRAFTER
- REPLACE (regex with flags), ENCODE_FOR_URI

Numeric functions:
- FLOOR, CEIL, ROUND, ABS

Date/time accessors:
- YEAR, MONTH, DAY, HOURS, MINUTES, SECONDS
- NOW, TZ

Hash functions:
- MD5, SHA1, SHA256, SHA512

Term constructors:
- IRI/URI, BNODE, UUID, STRUUID

Other functions:
- LANGMATCHES, RAND
- EXISTS / NOT EXISTS (with async pre-evaluation to bridge the
  sync expression evaluator and async algebra evaluator)

Algebra:
- MINUS set-difference operator
- HAVING already works via rdflib's Filter mapping (verified)

Fix SPARQL ORDER handling

Includes 653 lines of new unit tests covering all added functionality
across expressions, solutions, and algebra layers.
This commit is contained in:
cybermaggedon 2026-05-21 10:50:11 +01:00 committed by GitHub
parent e57f4669e1
commit 2c3a699af3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 1021 additions and 29 deletions

View file

@ -17,7 +17,7 @@ from ... knowledge import Uri
from ... knowledge import Literal as KgLiteral
from . parser import rdflib_term_to_term
from . solutions import (
hash_join, left_join, union, project, distinct,
hash_join, left_join, minus, union, project, distinct,
order_by, slice_solutions, _term_key,
)
from . expressions import evaluate_expression, _effective_boolean
@ -159,14 +159,69 @@ async def _eval_union(node, tc, collection, limit):
return union(left, right)[:limit]
async def _eval_minus(node, tc, collection, limit):
"""Evaluate a Minus node."""
left = await evaluate(node.p1, tc, collection, limit)
right = await evaluate(node.p2, tc, collection, limit)
return minus(left, right)
async def _check_exists(graph_node, sol, tc, collection, limit):
"""Evaluate an EXISTS graph pattern against a solution."""
results = await evaluate(graph_node, tc, collection, limit)
for r in results:
shared = set(sol.keys()) & set(r.keys())
if all(
_term_key(sol[v]) == _term_key(r[v])
for v in shared
if sol.get(v) is not None and r.get(v) is not None
):
return True
return False
async def _pre_eval_exists(expr, sol, tc, collection, limit, cache):
"""Walk an expression tree, pre-evaluate EXISTS/NOT EXISTS, cache results."""
if not isinstance(expr, CompValue):
return
if expr.name in ("Builtin_EXISTS", "Builtin_NOTEXISTS"):
key = id(expr.graph), id(sol)
if key not in cache:
cache[key] = await _check_exists(
expr.graph, sol, tc, collection, limit
)
return
for attr in ("expr", "other", "arg", "arg1", "arg2", "arg3"):
child = getattr(expr, attr, None)
if child is None:
continue
if isinstance(child, CompValue):
await _pre_eval_exists(child, sol, tc, collection, limit, cache)
elif isinstance(child, (list, tuple)):
for item in child:
if isinstance(item, CompValue):
await _pre_eval_exists(
item, sol, tc, collection, limit, cache
)
async def _eval_filter(node, tc, collection, limit):
"""Evaluate a Filter node."""
solutions = await evaluate(node.p, tc, collection, limit)
expr = node.expr
return [
sol for sol in solutions
if _effective_boolean(evaluate_expression(expr, sol))
]
exists_cache = {}
def exists_cb(graph_node, sol):
key = id(graph_node), id(sol)
return exists_cache.get(key, False)
result = []
for sol in solutions:
await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache)
if _effective_boolean(evaluate_expression(expr, sol, exists_cb=exists_cb)):
result.append(sol)
return result
async def _eval_distinct(node, tc, collection, limit):
@ -222,10 +277,16 @@ async def _eval_extend(node, tc, collection, limit):
solutions = await evaluate(node.p, tc, collection, limit)
var_name = str(node.var)
expr = node.expr
exists_cache = {}
def exists_cb(graph_node, sol):
key = id(graph_node), id(sol)
return exists_cache.get(key, False)
result = []
for sol in solutions:
val = evaluate_expression(expr, sol)
await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache)
val = evaluate_expression(expr, sol, exists_cb=exists_cb)
new_sol = dict(sol)
if isinstance(val, Term):
new_sol[var_name] = val
@ -525,6 +586,7 @@ _HANDLERS = {
"Join": _eval_join,
"LeftJoin": _eval_left_join,
"Union": _eval_union,
"Minus": _eval_minus,
"Filter": _eval_filter,
"Distinct": _eval_distinct,
"Reduced": _eval_reduced,

View file

@ -5,9 +5,15 @@ Evaluates rdflib algebra expression nodes against a solution (variable
binding) to produce a value or boolean result.
"""
import hashlib
import math
import random
import re
import logging
import operator
import uuid
from datetime import datetime, date, timezone
from urllib.parse import quote
from rdflib.term import Variable, URIRef, Literal, BNode
from rdflib.plugins.sparql.parserutils import CompValue
@ -17,23 +23,31 @@ from . parser import rdflib_term_to_term
logger = logging.getLogger(__name__)
_exists_callback = None
class ExpressionError(Exception):
"""Raised when a SPARQL expression cannot be evaluated."""
pass
def evaluate_expression(expr, solution):
def evaluate_expression(expr, solution, exists_cb=None):
"""
Evaluate a SPARQL expression against a solution binding.
Args:
expr: rdflib algebra expression node
solution: dict mapping variable names to Term values
exists_cb: optional callback(graph_node, solution) -> bool for
EXISTS/NOT EXISTS evaluation; provided by algebra.py
Returns:
The result value (Term, bool, number, string, or None)
"""
global _exists_callback
if exists_cb is not None:
_exists_callback = exists_cb
if expr is None:
return True
@ -119,15 +133,7 @@ def _evaluate_comp_value(node, solution):
if name == "Function":
return _eval_function(node, solution)
# Exists / NotExists
if name == "Builtin_EXISTS":
# EXISTS requires graph pattern evaluation - not handled here
logger.warning("EXISTS not supported in filter expressions")
return True
if name == "Builtin_NOTEXISTS":
logger.warning("NOT EXISTS not supported in filter expressions")
return True
# Exists / NotExists — handled via _eval_builtin now
# TrueFilter (used with OPTIONAL)
if name == "TrueFilter":
@ -335,6 +341,197 @@ def _eval_builtin(name, node, solution):
return val
return None
if builtin == "YEAR":
dt = _to_datetime(evaluate_expression(node.arg, solution))
return dt.year if dt is not None else None
if builtin == "MONTH":
dt = _to_datetime(evaluate_expression(node.arg, solution))
return dt.month if dt is not None else None
if builtin == "DAY":
dt = _to_datetime(evaluate_expression(node.arg, solution))
return dt.day if dt is not None else None
if builtin == "HOURS":
dt = _to_datetime(evaluate_expression(node.arg, solution))
if dt is None:
return None
return dt.hour if isinstance(dt, datetime) else 0
if builtin == "MINUTES":
dt = _to_datetime(evaluate_expression(node.arg, solution))
if dt is None:
return None
return dt.minute if isinstance(dt, datetime) else 0
if builtin == "SECONDS":
dt = _to_datetime(evaluate_expression(node.arg, solution))
if dt is None:
return None
return dt.second if isinstance(dt, datetime) else 0
if builtin == "FLOOR":
val = _to_numeric(evaluate_expression(node.arg, solution))
if val is None:
return None
return int(math.floor(val))
if builtin == "CEIL":
val = _to_numeric(evaluate_expression(node.arg, solution))
if val is None:
return None
return int(math.ceil(val))
if builtin == "ABS":
val = _to_numeric(evaluate_expression(node.arg, solution))
if val is None:
return None
return abs(val)
if builtin == "ROUND":
val = _to_numeric(evaluate_expression(node.arg, solution))
if val is None:
return None
return round(val)
if builtin == "STRBEFORE":
string = _to_string(evaluate_expression(node.arg1, solution))
sep = _to_string(evaluate_expression(node.arg2, solution))
idx = string.find(sep)
if idx < 0:
return Term(type=LITERAL, value="")
return Term(type=LITERAL, value=string[:idx])
if builtin == "STRAFTER":
string = _to_string(evaluate_expression(node.arg1, solution))
sep = _to_string(evaluate_expression(node.arg2, solution))
idx = string.find(sep)
if idx < 0:
return Term(type=LITERAL, value="")
return Term(type=LITERAL, value=string[idx + len(sep):])
if builtin == "ENCODE_FOR_URI":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(type=LITERAL, value=quote(val, safe=""))
if builtin == "REPLACE":
string = _to_string(evaluate_expression(node.arg, solution))
pattern = _to_string(evaluate_expression(node.pattern, solution))
replacement = _to_string(
evaluate_expression(node.replacement, solution)
)
flags_str = ""
if hasattr(node, "flags") and node.flags is not None:
flags_str = _to_string(evaluate_expression(node.flags, solution))
re_flags = 0
if "i" in flags_str:
re_flags |= re.IGNORECASE
if "m" in flags_str:
re_flags |= re.MULTILINE
if "s" in flags_str:
re_flags |= re.DOTALL
try:
result = re.sub(pattern, replacement, string, flags=re_flags)
return Term(type=LITERAL, value=result)
except re.error:
return None
if builtin == "SUBSTR":
string = _to_string(evaluate_expression(node.arg, solution))
start = _to_numeric(evaluate_expression(node.start, solution))
if start is None:
return None
start_idx = max(int(start) - 1, 0)
if hasattr(node, "length") and node.length is not None:
length = _to_numeric(evaluate_expression(node.length, solution))
if length is None:
return None
return Term(
type=LITERAL, value=string[start_idx:start_idx + int(length)]
)
return Term(type=LITERAL, value=string[start_idx:])
if builtin == "EXISTS":
if _exists_callback is not None:
return _exists_callback(node.graph, solution)
logger.warning("EXISTS requires an exists_cb; not available")
return True
if builtin == "NOTEXISTS":
if _exists_callback is not None:
return not _exists_callback(node.graph, solution)
logger.warning("NOT EXISTS requires an exists_cb; not available")
return True
if builtin == "LANGMATCHES":
tag = _to_string(evaluate_expression(node.arg1, solution))
rng = _to_string(evaluate_expression(node.arg2, solution))
if rng == "*":
return len(tag) > 0
return tag.lower().startswith(rng.lower())
if builtin == "IRI" or builtin == "URI":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(type=IRI, iri=val)
if builtin == "BNODE":
if hasattr(node, "arg") and node.arg is not None:
label = _to_string(evaluate_expression(node.arg, solution))
return Term(type=BLANK, id=label)
return Term(type=BLANK, id=str(uuid.uuid4()))
if builtin == "NOW":
now = datetime.now(timezone.utc)
return Term(
type=LITERAL,
value=now.strftime("%Y-%m-%dT%H:%M:%S%z"),
datatype="http://www.w3.org/2001/XMLSchema#dateTime",
)
if builtin == "TZ":
dt = _to_datetime(evaluate_expression(node.arg, solution))
if dt is None:
return Term(type=LITERAL, value="")
if dt.tzinfo is not None:
offset = dt.strftime("%z")
if offset:
return Term(type=LITERAL, value=offset[:3] + ":" + offset[3:])
return Term(type=LITERAL, value="")
if builtin == "RAND":
return random.random()
if builtin == "UUID":
return Term(type=IRI, iri="urn:uuid:" + str(uuid.uuid4()))
if builtin == "STRUUID":
return Term(type=LITERAL, value=str(uuid.uuid4()))
if builtin == "MD5":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(
type=LITERAL, value=hashlib.md5(val.encode()).hexdigest()
)
if builtin == "SHA1":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(
type=LITERAL, value=hashlib.sha1(val.encode()).hexdigest()
)
if builtin == "SHA256":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(
type=LITERAL, value=hashlib.sha256(val.encode()).hexdigest()
)
if builtin == "SHA512":
val = _to_string(evaluate_expression(node.arg, solution))
return Term(
type=LITERAL, value=hashlib.sha512(val.encode()).hexdigest()
)
if builtin == "sameTerm":
left = evaluate_expression(node.arg1, solution)
right = evaluate_expression(node.arg2, solution)
@ -454,6 +651,27 @@ def _to_numeric(val):
return None
def _to_datetime(val):
"""Convert a value to a date or datetime object."""
if val is None:
return None
s = _to_string(val)
for fmt in (
"%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z",
"%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M",
"%Y-%m-%d",
):
try:
return datetime.strptime(s, fmt)
except ValueError:
continue
try:
return datetime.fromisoformat(s)
except ValueError:
pass
return None
def _comparable_value(val):
"""
Convert a value to a form suitable for comparison.

View file

@ -150,6 +150,30 @@ def left_join(left, right, filter_fn=None):
return results
def minus(left, right):
"""
MINUS operation: remove left solutions that are compatible with any
right solution sharing at least one variable.
"""
if not right:
return list(left)
right_vars = set()
for sol in right:
right_vars.update(sol.keys())
results = []
for sol_l in left:
shared = set(sol_l.keys()) & right_vars
if not shared:
results.append(sol_l)
continue
if not any(_compatible(sol_l, sol_r) for sol_r in right):
results.append(sol_l)
return results
def union(left, right):
"""Union two solution sequences (concatenation)."""
return list(left) + list(right)
@ -177,6 +201,28 @@ def distinct(solutions):
return results
def _sort_comparable(val):
"""Convert a value to a form suitable for sort ordering."""
if val is None:
return (0, "")
if isinstance(val, (int, float)):
return (2, val)
if isinstance(val, Term):
if val.type == LITERAL:
try:
if "." in val.value:
return (2, float(val.value))
return (2, int(val.value))
except (ValueError, TypeError):
pass
return (3, val.value)
elif val.type == IRI:
return (4, val.iri)
elif val.type == BLANK:
return (5, val.id)
return (6, str(val))
def order_by(solutions, key_fns):
"""
Sort solutions by the given key functions.
@ -191,14 +237,7 @@ def order_by(solutions, key_fns):
keys = []
for fn, ascending in key_fns:
val = fn(sol)
# Convert to comparable form
if val is None:
comparable = ("", "")
elif isinstance(val, Term):
comparable = _term_key(val)
else:
comparable = ("v", str(val))
keys.append(comparable)
keys.append(_sort_comparable(val))
return keys
# Handle ascending/descending
@ -224,10 +263,8 @@ def _mixed_sort(solutions, key_fns):
def compare(a, b):
for fn, ascending in key_fns:
va = fn(a)
vb = fn(b)
ka = _term_key(va) if isinstance(va, Term) else ("v", str(va)) if va is not None else ("", "")
kb = _term_key(vb) if isinstance(vb, Term) else ("v", str(vb)) if vb is not None else ("", "")
ka = _sort_comparable(fn(a))
kb = _sort_comparable(fn(b))
if ka < kb:
return -1 if ascending else 1