mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-28 00:35:13 +02:00
feat: extend SPARQL evaluator with comprehensive function and operator support (#945)
Add 30+ SPARQL 1.1 built-in functions and the MINUS algebra operator to the custom SPARQL query backend. String functions: - SUBSTR (2-arg and 3-arg forms), STRBEFORE, STRAFTER - REPLACE (regex with flags), ENCODE_FOR_URI Numeric functions: - FLOOR, CEIL, ROUND, ABS Date/time accessors: - YEAR, MONTH, DAY, HOURS, MINUTES, SECONDS - NOW, TZ Hash functions: - MD5, SHA1, SHA256, SHA512 Term constructors: - IRI/URI, BNODE, UUID, STRUUID Other functions: - LANGMATCHES, RAND - EXISTS / NOT EXISTS (with async pre-evaluation to bridge the sync expression evaluator and async algebra evaluator) Algebra: - MINUS set-difference operator - HAVING already works via rdflib's Filter mapping (verified) Fix SPARQL ORDER handling Includes 653 lines of new unit tests covering all added functionality across expressions, solutions, and algebra layers.
This commit is contained in:
parent
e57f4669e1
commit
2c3a699af3
6 changed files with 1021 additions and 29 deletions
|
|
@ -17,7 +17,7 @@ from ... knowledge import Uri
|
|||
from ... knowledge import Literal as KgLiteral
|
||||
from . parser import rdflib_term_to_term
|
||||
from . solutions import (
|
||||
hash_join, left_join, union, project, distinct,
|
||||
hash_join, left_join, minus, union, project, distinct,
|
||||
order_by, slice_solutions, _term_key,
|
||||
)
|
||||
from . expressions import evaluate_expression, _effective_boolean
|
||||
|
|
@ -159,14 +159,69 @@ async def _eval_union(node, tc, collection, limit):
|
|||
return union(left, right)[:limit]
|
||||
|
||||
|
||||
async def _eval_minus(node, tc, collection, limit):
|
||||
"""Evaluate a Minus node."""
|
||||
left = await evaluate(node.p1, tc, collection, limit)
|
||||
right = await evaluate(node.p2, tc, collection, limit)
|
||||
return minus(left, right)
|
||||
|
||||
|
||||
async def _check_exists(graph_node, sol, tc, collection, limit):
|
||||
"""Evaluate an EXISTS graph pattern against a solution."""
|
||||
results = await evaluate(graph_node, tc, collection, limit)
|
||||
for r in results:
|
||||
shared = set(sol.keys()) & set(r.keys())
|
||||
if all(
|
||||
_term_key(sol[v]) == _term_key(r[v])
|
||||
for v in shared
|
||||
if sol.get(v) is not None and r.get(v) is not None
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
async def _pre_eval_exists(expr, sol, tc, collection, limit, cache):
|
||||
"""Walk an expression tree, pre-evaluate EXISTS/NOT EXISTS, cache results."""
|
||||
if not isinstance(expr, CompValue):
|
||||
return
|
||||
if expr.name in ("Builtin_EXISTS", "Builtin_NOTEXISTS"):
|
||||
key = id(expr.graph), id(sol)
|
||||
if key not in cache:
|
||||
cache[key] = await _check_exists(
|
||||
expr.graph, sol, tc, collection, limit
|
||||
)
|
||||
return
|
||||
for attr in ("expr", "other", "arg", "arg1", "arg2", "arg3"):
|
||||
child = getattr(expr, attr, None)
|
||||
if child is None:
|
||||
continue
|
||||
if isinstance(child, CompValue):
|
||||
await _pre_eval_exists(child, sol, tc, collection, limit, cache)
|
||||
elif isinstance(child, (list, tuple)):
|
||||
for item in child:
|
||||
if isinstance(item, CompValue):
|
||||
await _pre_eval_exists(
|
||||
item, sol, tc, collection, limit, cache
|
||||
)
|
||||
|
||||
|
||||
async def _eval_filter(node, tc, collection, limit):
|
||||
"""Evaluate a Filter node."""
|
||||
solutions = await evaluate(node.p, tc, collection, limit)
|
||||
expr = node.expr
|
||||
return [
|
||||
sol for sol in solutions
|
||||
if _effective_boolean(evaluate_expression(expr, sol))
|
||||
]
|
||||
exists_cache = {}
|
||||
|
||||
def exists_cb(graph_node, sol):
|
||||
key = id(graph_node), id(sol)
|
||||
return exists_cache.get(key, False)
|
||||
|
||||
result = []
|
||||
for sol in solutions:
|
||||
await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache)
|
||||
if _effective_boolean(evaluate_expression(expr, sol, exists_cb=exists_cb)):
|
||||
result.append(sol)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def _eval_distinct(node, tc, collection, limit):
|
||||
|
|
@ -222,10 +277,16 @@ async def _eval_extend(node, tc, collection, limit):
|
|||
solutions = await evaluate(node.p, tc, collection, limit)
|
||||
var_name = str(node.var)
|
||||
expr = node.expr
|
||||
exists_cache = {}
|
||||
|
||||
def exists_cb(graph_node, sol):
|
||||
key = id(graph_node), id(sol)
|
||||
return exists_cache.get(key, False)
|
||||
|
||||
result = []
|
||||
for sol in solutions:
|
||||
val = evaluate_expression(expr, sol)
|
||||
await _pre_eval_exists(expr, sol, tc, collection, limit, exists_cache)
|
||||
val = evaluate_expression(expr, sol, exists_cb=exists_cb)
|
||||
new_sol = dict(sol)
|
||||
if isinstance(val, Term):
|
||||
new_sol[var_name] = val
|
||||
|
|
@ -525,6 +586,7 @@ _HANDLERS = {
|
|||
"Join": _eval_join,
|
||||
"LeftJoin": _eval_left_join,
|
||||
"Union": _eval_union,
|
||||
"Minus": _eval_minus,
|
||||
"Filter": _eval_filter,
|
||||
"Distinct": _eval_distinct,
|
||||
"Reduced": _eval_reduced,
|
||||
|
|
|
|||
|
|
@ -5,9 +5,15 @@ Evaluates rdflib algebra expression nodes against a solution (variable
|
|||
binding) to produce a value or boolean result.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import math
|
||||
import random
|
||||
import re
|
||||
import logging
|
||||
import operator
|
||||
import uuid
|
||||
from datetime import datetime, date, timezone
|
||||
from urllib.parse import quote
|
||||
|
||||
from rdflib.term import Variable, URIRef, Literal, BNode
|
||||
from rdflib.plugins.sparql.parserutils import CompValue
|
||||
|
|
@ -17,23 +23,31 @@ from . parser import rdflib_term_to_term
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_exists_callback = None
|
||||
|
||||
|
||||
class ExpressionError(Exception):
|
||||
"""Raised when a SPARQL expression cannot be evaluated."""
|
||||
pass
|
||||
|
||||
|
||||
def evaluate_expression(expr, solution):
|
||||
def evaluate_expression(expr, solution, exists_cb=None):
|
||||
"""
|
||||
Evaluate a SPARQL expression against a solution binding.
|
||||
|
||||
Args:
|
||||
expr: rdflib algebra expression node
|
||||
solution: dict mapping variable names to Term values
|
||||
exists_cb: optional callback(graph_node, solution) -> bool for
|
||||
EXISTS/NOT EXISTS evaluation; provided by algebra.py
|
||||
|
||||
Returns:
|
||||
The result value (Term, bool, number, string, or None)
|
||||
"""
|
||||
global _exists_callback
|
||||
if exists_cb is not None:
|
||||
_exists_callback = exists_cb
|
||||
|
||||
if expr is None:
|
||||
return True
|
||||
|
||||
|
|
@ -119,15 +133,7 @@ def _evaluate_comp_value(node, solution):
|
|||
if name == "Function":
|
||||
return _eval_function(node, solution)
|
||||
|
||||
# Exists / NotExists
|
||||
if name == "Builtin_EXISTS":
|
||||
# EXISTS requires graph pattern evaluation - not handled here
|
||||
logger.warning("EXISTS not supported in filter expressions")
|
||||
return True
|
||||
|
||||
if name == "Builtin_NOTEXISTS":
|
||||
logger.warning("NOT EXISTS not supported in filter expressions")
|
||||
return True
|
||||
# Exists / NotExists — handled via _eval_builtin now
|
||||
|
||||
# TrueFilter (used with OPTIONAL)
|
||||
if name == "TrueFilter":
|
||||
|
|
@ -335,6 +341,197 @@ def _eval_builtin(name, node, solution):
|
|||
return val
|
||||
return None
|
||||
|
||||
if builtin == "YEAR":
|
||||
dt = _to_datetime(evaluate_expression(node.arg, solution))
|
||||
return dt.year if dt is not None else None
|
||||
|
||||
if builtin == "MONTH":
|
||||
dt = _to_datetime(evaluate_expression(node.arg, solution))
|
||||
return dt.month if dt is not None else None
|
||||
|
||||
if builtin == "DAY":
|
||||
dt = _to_datetime(evaluate_expression(node.arg, solution))
|
||||
return dt.day if dt is not None else None
|
||||
|
||||
if builtin == "HOURS":
|
||||
dt = _to_datetime(evaluate_expression(node.arg, solution))
|
||||
if dt is None:
|
||||
return None
|
||||
return dt.hour if isinstance(dt, datetime) else 0
|
||||
|
||||
if builtin == "MINUTES":
|
||||
dt = _to_datetime(evaluate_expression(node.arg, solution))
|
||||
if dt is None:
|
||||
return None
|
||||
return dt.minute if isinstance(dt, datetime) else 0
|
||||
|
||||
if builtin == "SECONDS":
|
||||
dt = _to_datetime(evaluate_expression(node.arg, solution))
|
||||
if dt is None:
|
||||
return None
|
||||
return dt.second if isinstance(dt, datetime) else 0
|
||||
|
||||
if builtin == "FLOOR":
|
||||
val = _to_numeric(evaluate_expression(node.arg, solution))
|
||||
if val is None:
|
||||
return None
|
||||
return int(math.floor(val))
|
||||
|
||||
if builtin == "CEIL":
|
||||
val = _to_numeric(evaluate_expression(node.arg, solution))
|
||||
if val is None:
|
||||
return None
|
||||
return int(math.ceil(val))
|
||||
|
||||
if builtin == "ABS":
|
||||
val = _to_numeric(evaluate_expression(node.arg, solution))
|
||||
if val is None:
|
||||
return None
|
||||
return abs(val)
|
||||
|
||||
if builtin == "ROUND":
|
||||
val = _to_numeric(evaluate_expression(node.arg, solution))
|
||||
if val is None:
|
||||
return None
|
||||
return round(val)
|
||||
|
||||
if builtin == "STRBEFORE":
|
||||
string = _to_string(evaluate_expression(node.arg1, solution))
|
||||
sep = _to_string(evaluate_expression(node.arg2, solution))
|
||||
idx = string.find(sep)
|
||||
if idx < 0:
|
||||
return Term(type=LITERAL, value="")
|
||||
return Term(type=LITERAL, value=string[:idx])
|
||||
|
||||
if builtin == "STRAFTER":
|
||||
string = _to_string(evaluate_expression(node.arg1, solution))
|
||||
sep = _to_string(evaluate_expression(node.arg2, solution))
|
||||
idx = string.find(sep)
|
||||
if idx < 0:
|
||||
return Term(type=LITERAL, value="")
|
||||
return Term(type=LITERAL, value=string[idx + len(sep):])
|
||||
|
||||
if builtin == "ENCODE_FOR_URI":
|
||||
val = _to_string(evaluate_expression(node.arg, solution))
|
||||
return Term(type=LITERAL, value=quote(val, safe=""))
|
||||
|
||||
if builtin == "REPLACE":
|
||||
string = _to_string(evaluate_expression(node.arg, solution))
|
||||
pattern = _to_string(evaluate_expression(node.pattern, solution))
|
||||
replacement = _to_string(
|
||||
evaluate_expression(node.replacement, solution)
|
||||
)
|
||||
flags_str = ""
|
||||
if hasattr(node, "flags") and node.flags is not None:
|
||||
flags_str = _to_string(evaluate_expression(node.flags, solution))
|
||||
re_flags = 0
|
||||
if "i" in flags_str:
|
||||
re_flags |= re.IGNORECASE
|
||||
if "m" in flags_str:
|
||||
re_flags |= re.MULTILINE
|
||||
if "s" in flags_str:
|
||||
re_flags |= re.DOTALL
|
||||
try:
|
||||
result = re.sub(pattern, replacement, string, flags=re_flags)
|
||||
return Term(type=LITERAL, value=result)
|
||||
except re.error:
|
||||
return None
|
||||
|
||||
if builtin == "SUBSTR":
|
||||
string = _to_string(evaluate_expression(node.arg, solution))
|
||||
start = _to_numeric(evaluate_expression(node.start, solution))
|
||||
if start is None:
|
||||
return None
|
||||
start_idx = max(int(start) - 1, 0)
|
||||
if hasattr(node, "length") and node.length is not None:
|
||||
length = _to_numeric(evaluate_expression(node.length, solution))
|
||||
if length is None:
|
||||
return None
|
||||
return Term(
|
||||
type=LITERAL, value=string[start_idx:start_idx + int(length)]
|
||||
)
|
||||
return Term(type=LITERAL, value=string[start_idx:])
|
||||
|
||||
if builtin == "EXISTS":
|
||||
if _exists_callback is not None:
|
||||
return _exists_callback(node.graph, solution)
|
||||
logger.warning("EXISTS requires an exists_cb; not available")
|
||||
return True
|
||||
|
||||
if builtin == "NOTEXISTS":
|
||||
if _exists_callback is not None:
|
||||
return not _exists_callback(node.graph, solution)
|
||||
logger.warning("NOT EXISTS requires an exists_cb; not available")
|
||||
return True
|
||||
|
||||
if builtin == "LANGMATCHES":
|
||||
tag = _to_string(evaluate_expression(node.arg1, solution))
|
||||
rng = _to_string(evaluate_expression(node.arg2, solution))
|
||||
if rng == "*":
|
||||
return len(tag) > 0
|
||||
return tag.lower().startswith(rng.lower())
|
||||
|
||||
if builtin == "IRI" or builtin == "URI":
|
||||
val = _to_string(evaluate_expression(node.arg, solution))
|
||||
return Term(type=IRI, iri=val)
|
||||
|
||||
if builtin == "BNODE":
|
||||
if hasattr(node, "arg") and node.arg is not None:
|
||||
label = _to_string(evaluate_expression(node.arg, solution))
|
||||
return Term(type=BLANK, id=label)
|
||||
return Term(type=BLANK, id=str(uuid.uuid4()))
|
||||
|
||||
if builtin == "NOW":
|
||||
now = datetime.now(timezone.utc)
|
||||
return Term(
|
||||
type=LITERAL,
|
||||
value=now.strftime("%Y-%m-%dT%H:%M:%S%z"),
|
||||
datatype="http://www.w3.org/2001/XMLSchema#dateTime",
|
||||
)
|
||||
|
||||
if builtin == "TZ":
|
||||
dt = _to_datetime(evaluate_expression(node.arg, solution))
|
||||
if dt is None:
|
||||
return Term(type=LITERAL, value="")
|
||||
if dt.tzinfo is not None:
|
||||
offset = dt.strftime("%z")
|
||||
if offset:
|
||||
return Term(type=LITERAL, value=offset[:3] + ":" + offset[3:])
|
||||
return Term(type=LITERAL, value="")
|
||||
|
||||
if builtin == "RAND":
|
||||
return random.random()
|
||||
|
||||
if builtin == "UUID":
|
||||
return Term(type=IRI, iri="urn:uuid:" + str(uuid.uuid4()))
|
||||
|
||||
if builtin == "STRUUID":
|
||||
return Term(type=LITERAL, value=str(uuid.uuid4()))
|
||||
|
||||
if builtin == "MD5":
|
||||
val = _to_string(evaluate_expression(node.arg, solution))
|
||||
return Term(
|
||||
type=LITERAL, value=hashlib.md5(val.encode()).hexdigest()
|
||||
)
|
||||
|
||||
if builtin == "SHA1":
|
||||
val = _to_string(evaluate_expression(node.arg, solution))
|
||||
return Term(
|
||||
type=LITERAL, value=hashlib.sha1(val.encode()).hexdigest()
|
||||
)
|
||||
|
||||
if builtin == "SHA256":
|
||||
val = _to_string(evaluate_expression(node.arg, solution))
|
||||
return Term(
|
||||
type=LITERAL, value=hashlib.sha256(val.encode()).hexdigest()
|
||||
)
|
||||
|
||||
if builtin == "SHA512":
|
||||
val = _to_string(evaluate_expression(node.arg, solution))
|
||||
return Term(
|
||||
type=LITERAL, value=hashlib.sha512(val.encode()).hexdigest()
|
||||
)
|
||||
|
||||
if builtin == "sameTerm":
|
||||
left = evaluate_expression(node.arg1, solution)
|
||||
right = evaluate_expression(node.arg2, solution)
|
||||
|
|
@ -454,6 +651,27 @@ def _to_numeric(val):
|
|||
return None
|
||||
|
||||
|
||||
def _to_datetime(val):
|
||||
"""Convert a value to a date or datetime object."""
|
||||
if val is None:
|
||||
return None
|
||||
s = _to_string(val)
|
||||
for fmt in (
|
||||
"%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z",
|
||||
"%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M",
|
||||
"%Y-%m-%d",
|
||||
):
|
||||
try:
|
||||
return datetime.strptime(s, fmt)
|
||||
except ValueError:
|
||||
continue
|
||||
try:
|
||||
return datetime.fromisoformat(s)
|
||||
except ValueError:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _comparable_value(val):
|
||||
"""
|
||||
Convert a value to a form suitable for comparison.
|
||||
|
|
|
|||
|
|
@ -150,6 +150,30 @@ def left_join(left, right, filter_fn=None):
|
|||
return results
|
||||
|
||||
|
||||
def minus(left, right):
|
||||
"""
|
||||
MINUS operation: remove left solutions that are compatible with any
|
||||
right solution sharing at least one variable.
|
||||
"""
|
||||
if not right:
|
||||
return list(left)
|
||||
|
||||
right_vars = set()
|
||||
for sol in right:
|
||||
right_vars.update(sol.keys())
|
||||
|
||||
results = []
|
||||
for sol_l in left:
|
||||
shared = set(sol_l.keys()) & right_vars
|
||||
if not shared:
|
||||
results.append(sol_l)
|
||||
continue
|
||||
if not any(_compatible(sol_l, sol_r) for sol_r in right):
|
||||
results.append(sol_l)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def union(left, right):
|
||||
"""Union two solution sequences (concatenation)."""
|
||||
return list(left) + list(right)
|
||||
|
|
@ -177,6 +201,28 @@ def distinct(solutions):
|
|||
return results
|
||||
|
||||
|
||||
def _sort_comparable(val):
|
||||
"""Convert a value to a form suitable for sort ordering."""
|
||||
if val is None:
|
||||
return (0, "")
|
||||
if isinstance(val, (int, float)):
|
||||
return (2, val)
|
||||
if isinstance(val, Term):
|
||||
if val.type == LITERAL:
|
||||
try:
|
||||
if "." in val.value:
|
||||
return (2, float(val.value))
|
||||
return (2, int(val.value))
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
return (3, val.value)
|
||||
elif val.type == IRI:
|
||||
return (4, val.iri)
|
||||
elif val.type == BLANK:
|
||||
return (5, val.id)
|
||||
return (6, str(val))
|
||||
|
||||
|
||||
def order_by(solutions, key_fns):
|
||||
"""
|
||||
Sort solutions by the given key functions.
|
||||
|
|
@ -191,14 +237,7 @@ def order_by(solutions, key_fns):
|
|||
keys = []
|
||||
for fn, ascending in key_fns:
|
||||
val = fn(sol)
|
||||
# Convert to comparable form
|
||||
if val is None:
|
||||
comparable = ("", "")
|
||||
elif isinstance(val, Term):
|
||||
comparable = _term_key(val)
|
||||
else:
|
||||
comparable = ("v", str(val))
|
||||
keys.append(comparable)
|
||||
keys.append(_sort_comparable(val))
|
||||
return keys
|
||||
|
||||
# Handle ascending/descending
|
||||
|
|
@ -224,10 +263,8 @@ def _mixed_sort(solutions, key_fns):
|
|||
|
||||
def compare(a, b):
|
||||
for fn, ascending in key_fns:
|
||||
va = fn(a)
|
||||
vb = fn(b)
|
||||
ka = _term_key(va) if isinstance(va, Term) else ("v", str(va)) if va is not None else ("", "")
|
||||
kb = _term_key(vb) if isinstance(vb, Term) else ("v", str(vb)) if vb is not None else ("", "")
|
||||
ka = _sort_comparable(fn(a))
|
||||
kb = _sort_comparable(fn(b))
|
||||
|
||||
if ka < kb:
|
||||
return -1 if ascending else 1
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue