feat: extend SPARQL evaluator with comprehensive function and operator support (#945)

Add 30+ SPARQL 1.1 built-in functions and the MINUS algebra operator to the custom SPARQL query backend. String functions: - SUBSTR (2-arg and 3-arg forms), STRBEFORE, STRAFTER - REPLACE (regex with flags), ENCODE_FOR_URI Numeric functions: - FLOOR, CEIL, ROUND, ABS Date/time accessors: - YEAR, MONTH, DAY, HOURS, MINUTES, SECONDS - NOW, TZ Hash functions: - MD5, SHA1, SHA256, SHA512 Term constructors: - IRI/URI, BNODE, UUID, STRUUID Other functions: - LANGMATCHES, RAND - EXISTS / NOT EXISTS (with async pre-evaluation to bridge the sync expression evaluator and async algebra evaluator) Algebra: - MINUS set-difference operator - HAVING already works via rdflib's Filter mapping (verified) Fix SPARQL ORDER handling Includes 653 lines of new unit tests covering all added functionality across expressions, solutions, and algebra layers.
2026-07-12 14:52:11 +02:00 · 2026-05-21 10:50:11 +01:00 · 2026-05-21 10:50:11 +01:00 · 2c3a699af3
commit 2c3a699af3
parent e57f4669e1
6 changed files with 1021 additions and 29 deletions
--- a/trustgraph-flow/trustgraph/query/sparql/algebra.py
+++ b/trustgraph-flow/trustgraph/query/sparql/algebra.py
@ -17,7 +17,7 @@ from ... knowledge import Uri
 from ... knowledge import Literal as KgLiteral
 from . parser import rdflib_term_to_term
 from . solutions import (
-    hash_join, left_join, union, project, distinct,
+    hash_join, left_join, minus, union, project, distinct,
    order_by, slice_solutions, _term_key,
 )
 from . expressions import evaluate_expression, _effective_boolean
@ -159,14 +159,69 @@ async def _eval_union(node, tc, collection, limit):
    return union(left, right)[:limit]


+async def _eval_minus(node, tc, collection, limit):
+    """Evaluate a Minus node."""
+    left = await evaluate(node.p1, tc, collection, limit)
+    right = await evaluate(node.p2, tc, collection, limit)
+    return minus(left, right)
+
+
+async def _check_exists(graph_node, sol, tc, collection, limit):
+    """Evaluate an EXISTS graph pattern against a solution."""
+    results = await evaluate(graph_node, tc, collection, limit)
+    for r in results:
+        shared = set(sol.keys()) & set(r.keys())
+        if all(
+            _term_key(sol[v]) == _term_key(r[v])
+            for v in shared
+            if sol.get(v) is not None and r.get(v) is not None
+        ):
+            return True
+    return False
+
+
+async def _pre_eval_exists(expr, sol, tc, collection, limit, cache):
+    """Walk an expression tree, pre-evaluate EXISTS/NOT EXISTS, cache results."""
+    if not isinstance(expr, CompValue):
+        return
+    if expr.name in ("Builtin_EXISTS", "Builtin_NOTEXISTS"):
+        key = id(expr.graph), id(sol)
+        if key not in cache:
+            cache[key] = await _check_exists(
+                expr.graph, sol, tc, collection, limit
+            )
+        return
+    for attr in ("expr", "other", "arg", "arg1", "arg2", "arg3"):
+        child = getattr(expr, attr, None)
+        if child is None:
+            continue
+        if isinstance(child, CompValue):
+            await _pre_eval_exists(child, sol, tc, collection, limit, cache)
+        elif isinstance(child, (list, tuple)):
+            for item in child:
+                if isinstance(item, CompValue):
+                    await _pre_eval_exists(
+                        item, sol, tc, collection, limit, cache
+                    )
+
+
 async def _eval_filter(node, tc, collection, limit):
    """Evaluate a Filter node."""
    solutions = await evaluate(node.p, tc, collection, limit)
    expr = node.expr
-    return [
-        sol for sol in solutions
-        if _effective_boolean(evaluate_expression(expr, sol))
-    ]
+    exists_cache = {}
+
+    def exists_cb(graph_node, sol):
+        key = id(graph_node), id(sol)
+        return exists_cache.get(key, False)
+
+    result = []
+    for sol in solutions:
+        await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache)
+        if _effective_boolean(evaluate_expression(expr, sol, exists_cb=exists_cb)):
+            result.append(sol)
+
+    return result


 async def _eval_distinct(node, tc, collection, limit):
@ -222,10 +277,16 @@ async def _eval_extend(node, tc, collection, limit):
    solutions = await evaluate(node.p, tc, collection, limit)
    var_name = str(node.var)
    expr = node.expr
+    exists_cache = {}
+
+    def exists_cb(graph_node, sol):
+        key = id(graph_node), id(sol)
+        return exists_cache.get(key, False)

    result = []
    for sol in solutions:
-        val = evaluate_expression(expr, sol)
+        await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache)
+        val = evaluate_expression(expr, sol, exists_cb=exists_cb)
        new_sol = dict(sol)
        if isinstance(val, Term):
            new_sol[var_name] = val
@ -525,6 +586,7 @@ _HANDLERS = {
    "Join": _eval_join,
    "LeftJoin": _eval_left_join,
    "Union": _eval_union,
+    "Minus": _eval_minus,
    "Filter": _eval_filter,
    "Distinct": _eval_distinct,
    "Reduced": _eval_reduced,
--- a/trustgraph-flow/trustgraph/query/sparql/expressions.py
+++ b/trustgraph-flow/trustgraph/query/sparql/expressions.py
@ -5,9 +5,15 @@ Evaluates rdflib algebra expression nodes against a solution (variable
 binding) to produce a value or boolean result.
 """

+import hashlib
+import math
+import random
 import re
 import logging
 import operator
+import uuid
+from datetime import datetime, date, timezone
+from urllib.parse import quote

 from rdflib.term import Variable, URIRef, Literal, BNode
 from rdflib.plugins.sparql.parserutils import CompValue
@ -17,23 +23,31 @@ from . parser import rdflib_term_to_term

 logger = logging.getLogger(__name__)

+_exists_callback = None
+

 class ExpressionError(Exception):
    """Raised when a SPARQL expression cannot be evaluated."""
    pass


-def evaluate_expression(expr, solution):
+def evaluate_expression(expr, solution, exists_cb=None):
    """
    Evaluate a SPARQL expression against a solution binding.

    Args:
        expr: rdflib algebra expression node
        solution: dict mapping variable names to Term values
+        exists_cb: optional callback(graph_node, solution) -> bool for
+            EXISTS/NOT EXISTS evaluation; provided by algebra.py

    Returns:
        The result value (Term, bool, number, string, or None)
    """
+    global _exists_callback
+    if exists_cb is not None:
+        _exists_callback = exists_cb
+
    if expr is None:
        return True

@ -119,15 +133,7 @@ def _evaluate_comp_value(node, solution):
    if name == "Function":
        return _eval_function(node, solution)

-    # Exists / NotExists
-    if name == "Builtin_EXISTS":
-        # EXISTS requires graph pattern evaluation - not handled here
-        logger.warning("EXISTS not supported in filter expressions")
-        return True
-
-    if name == "Builtin_NOTEXISTS":
-        logger.warning("NOT EXISTS not supported in filter expressions")
-        return True
+    # Exists / NotExists — handled via _eval_builtin now

    # TrueFilter (used with OPTIONAL)
    if name == "TrueFilter":
@ -335,6 +341,197 @@ def _eval_builtin(name, node, solution):
                return val
        return None

+    if builtin == "YEAR":
+        dt = _to_datetime(evaluate_expression(node.arg, solution))
+        return dt.year if dt is not None else None
+
+    if builtin == "MONTH":
+        dt = _to_datetime(evaluate_expression(node.arg, solution))
+        return dt.month if dt is not None else None
+
+    if builtin == "DAY":
+        dt = _to_datetime(evaluate_expression(node.arg, solution))
+        return dt.day if dt is not None else None
+
+    if builtin == "HOURS":
+        dt = _to_datetime(evaluate_expression(node.arg, solution))
+        if dt is None:
+            return None
+        return dt.hour if isinstance(dt, datetime) else 0
+
+    if builtin == "MINUTES":
+        dt = _to_datetime(evaluate_expression(node.arg, solution))
+        if dt is None:
+            return None
+        return dt.minute if isinstance(dt, datetime) else 0
+
+    if builtin == "SECONDS":
+        dt = _to_datetime(evaluate_expression(node.arg, solution))
+        if dt is None:
+            return None
+        return dt.second if isinstance(dt, datetime) else 0
+
+    if builtin == "FLOOR":
+        val = _to_numeric(evaluate_expression(node.arg, solution))
+        if val is None:
+            return None
+        return int(math.floor(val))
+
+    if builtin == "CEIL":
+        val = _to_numeric(evaluate_expression(node.arg, solution))
+        if val is None:
+            return None
+        return int(math.ceil(val))
+
+    if builtin == "ABS":
+        val = _to_numeric(evaluate_expression(node.arg, solution))
+        if val is None:
+            return None
+        return abs(val)
+
+    if builtin == "ROUND":
+        val = _to_numeric(evaluate_expression(node.arg, solution))
+        if val is None:
+            return None
+        return round(val)
+
+    if builtin == "STRBEFORE":
+        string = _to_string(evaluate_expression(node.arg1, solution))
+        sep = _to_string(evaluate_expression(node.arg2, solution))
+        idx = string.find(sep)
+        if idx < 0:
+            return Term(type=LITERAL, value="")
+        return Term(type=LITERAL, value=string[:idx])
+
+    if builtin == "STRAFTER":
+        string = _to_string(evaluate_expression(node.arg1, solution))
+        sep = _to_string(evaluate_expression(node.arg2, solution))
+        idx = string.find(sep)
+        if idx < 0:
+            return Term(type=LITERAL, value="")
+        return Term(type=LITERAL, value=string[idx + len(sep):])
+
+    if builtin == "ENCODE_FOR_URI":
+        val = _to_string(evaluate_expression(node.arg, solution))
+        return Term(type=LITERAL, value=quote(val, safe=""))
+
+    if builtin == "REPLACE":
+        string = _to_string(evaluate_expression(node.arg, solution))
+        pattern = _to_string(evaluate_expression(node.pattern, solution))
+        replacement = _to_string(
+            evaluate_expression(node.replacement, solution)
+        )
+        flags_str = ""
+        if hasattr(node, "flags") and node.flags is not None:
+            flags_str = _to_string(evaluate_expression(node.flags, solution))
+        re_flags = 0
+        if "i" in flags_str:
+            re_flags |= re.IGNORECASE
+        if "m" in flags_str:
+            re_flags |= re.MULTILINE
+        if "s" in flags_str:
+            re_flags |= re.DOTALL
+        try:
+            result = re.sub(pattern, replacement, string, flags=re_flags)
+            return Term(type=LITERAL, value=result)
+        except re.error:
+            return None
+
+    if builtin == "SUBSTR":
+        string = _to_string(evaluate_expression(node.arg, solution))
+        start = _to_numeric(evaluate_expression(node.start, solution))
+        if start is None:
+            return None
+        start_idx = max(int(start) - 1, 0)
+        if hasattr(node, "length") and node.length is not None:
+            length = _to_numeric(evaluate_expression(node.length, solution))
+            if length is None:
+                return None
+            return Term(
+                type=LITERAL, value=string[start_idx:start_idx + int(length)]
+            )
+        return Term(type=LITERAL, value=string[start_idx:])
+
+    if builtin == "EXISTS":
+        if _exists_callback is not None:
+            return _exists_callback(node.graph, solution)
+        logger.warning("EXISTS requires an exists_cb; not available")
+        return True
+
+    if builtin == "NOTEXISTS":
+        if _exists_callback is not None:
+            return not _exists_callback(node.graph, solution)
+        logger.warning("NOT EXISTS requires an exists_cb; not available")
+        return True
+
+    if builtin == "LANGMATCHES":
+        tag = _to_string(evaluate_expression(node.arg1, solution))
+        rng = _to_string(evaluate_expression(node.arg2, solution))
+        if rng == "*":
+            return len(tag) > 0
+        return tag.lower().startswith(rng.lower())
+
+    if builtin == "IRI" or builtin == "URI":
+        val = _to_string(evaluate_expression(node.arg, solution))
+        return Term(type=IRI, iri=val)
+
+    if builtin == "BNODE":
+        if hasattr(node, "arg") and node.arg is not None:
+            label = _to_string(evaluate_expression(node.arg, solution))
+            return Term(type=BLANK, id=label)
+        return Term(type=BLANK, id=str(uuid.uuid4()))
+
+    if builtin == "NOW":
+        now = datetime.now(timezone.utc)
+        return Term(
+            type=LITERAL,
+            value=now.strftime("%Y-%m-%dT%H:%M:%S%z"),
+            datatype="http://www.w3.org/2001/XMLSchema#dateTime",
+        )
+
+    if builtin == "TZ":
+        dt = _to_datetime(evaluate_expression(node.arg, solution))
+        if dt is None:
+            return Term(type=LITERAL, value="")
+        if dt.tzinfo is not None:
+            offset = dt.strftime("%z")
+            if offset:
+                return Term(type=LITERAL, value=offset[:3] + ":" + offset[3:])
+        return Term(type=LITERAL, value="")
+
+    if builtin == "RAND":
+        return random.random()
+
+    if builtin == "UUID":
+        return Term(type=IRI, iri="urn:uuid:" + str(uuid.uuid4()))
+
+    if builtin == "STRUUID":
+        return Term(type=LITERAL, value=str(uuid.uuid4()))
+
+    if builtin == "MD5":
+        val = _to_string(evaluate_expression(node.arg, solution))
+        return Term(
+            type=LITERAL, value=hashlib.md5(val.encode()).hexdigest()
+        )
+
+    if builtin == "SHA1":
+        val = _to_string(evaluate_expression(node.arg, solution))
+        return Term(
+            type=LITERAL, value=hashlib.sha1(val.encode()).hexdigest()
+        )
+
+    if builtin == "SHA256":
+        val = _to_string(evaluate_expression(node.arg, solution))
+        return Term(
+            type=LITERAL, value=hashlib.sha256(val.encode()).hexdigest()
+        )
+
+    if builtin == "SHA512":
+        val = _to_string(evaluate_expression(node.arg, solution))
+        return Term(
+            type=LITERAL, value=hashlib.sha512(val.encode()).hexdigest()
+        )
+
    if builtin == "sameTerm":
        left = evaluate_expression(node.arg1, solution)
        right = evaluate_expression(node.arg2, solution)
@ -454,6 +651,27 @@ def _to_numeric(val):
    return None


+def _to_datetime(val):
+    """Convert a value to a date or datetime object."""
+    if val is None:
+        return None
+    s = _to_string(val)
+    for fmt in (
+        "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z",
+        "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M",
+        "%Y-%m-%d",
+    ):
+        try:
+            return datetime.strptime(s, fmt)
+        except ValueError:
+            continue
+    try:
+        return datetime.fromisoformat(s)
+    except ValueError:
+        pass
+    return None
+
+
 def _comparable_value(val):
    """
    Convert a value to a form suitable for comparison.
--- a/trustgraph-flow/trustgraph/query/sparql/solutions.py
+++ b/trustgraph-flow/trustgraph/query/sparql/solutions.py
@ -150,6 +150,30 @@ def left_join(left, right, filter_fn=None):
    return results


+def minus(left, right):
+    """
+    MINUS operation: remove left solutions that are compatible with any
+    right solution sharing at least one variable.
+    """
+    if not right:
+        return list(left)
+
+    right_vars = set()
+    for sol in right:
+        right_vars.update(sol.keys())
+
+    results = []
+    for sol_l in left:
+        shared = set(sol_l.keys()) & right_vars
+        if not shared:
+            results.append(sol_l)
+            continue
+        if not any(_compatible(sol_l, sol_r) for sol_r in right):
+            results.append(sol_l)
+
+    return results
+
+
 def union(left, right):
    """Union two solution sequences (concatenation)."""
    return list(left) + list(right)
@ -177,6 +201,28 @@ def distinct(solutions):
    return results


+def _sort_comparable(val):
+    """Convert a value to a form suitable for sort ordering."""
+    if val is None:
+        return (0, "")
+    if isinstance(val, (int, float)):
+        return (2, val)
+    if isinstance(val, Term):
+        if val.type == LITERAL:
+            try:
+                if "." in val.value:
+                    return (2, float(val.value))
+                return (2, int(val.value))
+            except (ValueError, TypeError):
+                pass
+            return (3, val.value)
+        elif val.type == IRI:
+            return (4, val.iri)
+        elif val.type == BLANK:
+            return (5, val.id)
+    return (6, str(val))
+
+
 def order_by(solutions, key_fns):
    """
    Sort solutions by the given key functions.
@ -191,14 +237,7 @@ def order_by(solutions, key_fns):
        keys = []
        for fn, ascending in key_fns:
            val = fn(sol)
-            # Convert to comparable form
-            if val is None:
-                comparable = ("", "")
-            elif isinstance(val, Term):
-                comparable = _term_key(val)
-            else:
-                comparable = ("v", str(val))
-            keys.append(comparable)
+            keys.append(_sort_comparable(val))
        return keys

    # Handle ascending/descending
@ -224,10 +263,8 @@ def _mixed_sort(solutions, key_fns):

    def compare(a, b):
        for fn, ascending in key_fns:
-            va = fn(a)
-            vb = fn(b)
-            ka = _term_key(va) if isinstance(va, Term) else ("v", str(va)) if va is not None else ("", "")
-            kb = _term_key(vb) if isinstance(vb, Term) else ("v", str(vb)) if vb is not None else ("", "")
+            ka = _sort_comparable(fn(a))
+            kb = _sort_comparable(fn(b))

            if ka < kb:
                return -1 if ascending else 1