trustgraph/trustgraph-flow/trustgraph/query/sparql/solutions.py
cybermaggedon 2c3a699af3
feat: extend SPARQL evaluator with comprehensive function and operator support (#945)
Add 30+ SPARQL 1.1 built-in functions and the MINUS algebra operator to the
custom SPARQL query backend.

String functions:
- SUBSTR (2-arg and 3-arg forms), STRBEFORE, STRAFTER
- REPLACE (regex with flags), ENCODE_FOR_URI

Numeric functions:
- FLOOR, CEIL, ROUND, ABS

Date/time accessors:
- YEAR, MONTH, DAY, HOURS, MINUTES, SECONDS
- NOW, TZ

Hash functions:
- MD5, SHA1, SHA256, SHA512

Term constructors:
- IRI/URI, BNODE, UUID, STRUUID

Other functions:
- LANGMATCHES, RAND
- EXISTS / NOT EXISTS (with async pre-evaluation to bridge the
  sync expression evaluator and async algebra evaluator)

Algebra:
- MINUS set-difference operator
- HAVING already works via rdflib's Filter mapping (verified)

Fix SPARQL ORDER handling

Includes 653 lines of new unit tests covering all added functionality
across expressions, solutions, and algebra layers.
2026-05-21 10:50:11 +01:00

285 lines
7.5 KiB
Python

"""
Solution sequence operations for SPARQL evaluation.
A solution is a dict mapping variable names (str) to Term values.
A solution sequence is a list of solutions.
"""
import logging
from collections import defaultdict
from ... schema import Term, IRI, LITERAL, BLANK
logger = logging.getLogger(__name__)
def _term_key(term):
"""Create a hashable key from a Term for join/distinct operations."""
if term is None:
return None
if term.type == IRI:
return ("i", term.iri)
elif term.type == LITERAL:
return ("l", term.value, term.datatype, term.language)
elif term.type == BLANK:
return ("b", term.id)
else:
return ("?", str(term))
def _solution_key(solution, variables):
"""Create a hashable key from a solution for the given variables."""
return tuple(_term_key(solution.get(v)) for v in variables)
def _terms_equal(a, b):
"""Check if two Terms are equal."""
if a is None and b is None:
return True
if a is None or b is None:
return False
return _term_key(a) == _term_key(b)
def _compatible(sol_a, sol_b):
"""Check if two solutions are compatible (agree on shared variables)."""
shared = set(sol_a.keys()) & set(sol_b.keys())
return all(_terms_equal(sol_a[v], sol_b[v]) for v in shared)
def _merge(sol_a, sol_b):
"""Merge two compatible solutions into one."""
result = dict(sol_a)
result.update(sol_b)
return result
def hash_join(left, right):
    """
    Inner join two solution sequences on shared variables.

    Two solutions join when they are compatible, i.e. they agree on every
    variable that *both* of them bind.  A hash table built on the smaller
    input handles the common case where a solution binds all shared
    variables.  Solutions that leave a shared variable unbound (as can
    happen after OPTIONAL or UNION) cannot be found by hash lookup,
    because an unbound variable is compatible with any value; those are
    matched with an explicit compatibility check instead.

    Args:
        left: list of solutions (dict of variable name -> Term).
        right: list of solutions.

    Returns:
        A new list of merged solutions.  Results are grouped by the
        solutions of the larger (probe) input, in their original order.
    """
    if not left or not right:
        return []

    left_vars = set()
    for sol in left:
        left_vars.update(sol.keys())

    right_vars = set()
    for sol in right:
        right_vars.update(sol.keys())

    shared = sorted(left_vars & right_vars)

    if not shared:
        # Cross product: no common variable, so every pair is compatible
        return [_merge(sol_l, sol_r) for sol_l in left for sol_r in right]

    # Build the hash table on the smaller side, probe with the other
    build_is_left = len(left) <= len(right)
    build, probe = (left, right) if build_is_left else (right, left)

    index = defaultdict(list)
    partial = []  # build-side solutions missing at least one shared var
    for pos, sol in enumerate(build):
        entry = (pos, sol)
        if all(v in sol for v in shared):
            index[_solution_key(sol, shared)].append(entry)
        else:
            partial.append(entry)

    results = []
    for probe_sol in probe:
        if all(v in probe_sol for v in shared):
            key = _solution_key(probe_sol, shared)
            candidates = list(index.get(key, ()))
            if partial:
                candidates.extend(
                    entry for entry in partial
                    if _compatible(entry[1], probe_sol)
                )
                # Keep matches in build-side order
                candidates.sort(key=lambda entry: entry[0])
        else:
            # The probe solution itself has an unbound shared variable,
            # so its hash key is meaningless: scan the build side.
            candidates = [
                (pos, sol) for pos, sol in enumerate(build)
                if _compatible(sol, probe_sol)
            ]

        for _, build_sol in candidates:
            # Always merge as (left, right), whichever side was indexed
            if build_is_left:
                results.append(_merge(build_sol, probe_sol))
            else:
                results.append(_merge(probe_sol, build_sol))

    return results
def left_join(left, right, filter_fn=None):
    """
    Left outer join (OPTIONAL semantics).

    Every left solution is preserved.  For each left solution, every
    compatible right solution whose merge passes ``filter_fn`` yields a
    merged solution.  If there is no such right solution, a copy of the
    left solution is emitted unchanged.

    Compatibility means agreeing on every variable that both solutions
    bind.  Solutions that bind all shared variables are matched through a
    hash table; solutions that leave a shared variable unbound are
    matched with an explicit compatibility check, because an unbound
    variable is compatible with any value.

    Args:
        left: list of solutions (dict of variable name -> Term).
        right: list of solutions.
        filter_fn: optional predicate called with each merged solution;
            a falsy result rejects that merge.

    Returns:
        A new list of solutions, grouped by left solution in input order.
    """
    if not left:
        return []
    if not right:
        return list(left)

    right_vars = set()
    for sol in right:
        right_vars.update(sol.keys())

    left_vars = set()
    for sol in left:
        left_vars.update(sol.keys())

    shared = sorted(left_vars & right_vars)

    # Build hash table on right side; with no shared variables every
    # right solution lands under the empty key.
    index = defaultdict(list)
    partial = []  # right solutions missing at least one shared variable
    for pos, sol in enumerate(right):
        entry = (pos, sol)
        if all(v in sol for v in shared):
            index[_solution_key(sol, shared)].append(entry)
        else:
            partial.append(entry)

    results = []
    for sol_l in left:
        if all(v in sol_l for v in shared):
            key = _solution_key(sol_l, shared)
            candidates = list(index.get(key, ()))
            if partial:
                candidates.extend(
                    entry for entry in partial
                    if _compatible(sol_l, entry[1])
                )
                # Keep matches in right-side order
                candidates.sort(key=lambda entry: entry[0])
        else:
            # Left solution has an unbound shared variable: hash lookup
            # would miss valid matches, so scan the right side.
            candidates = [
                (pos, sol) for pos, sol in enumerate(right)
                if _compatible(sol_l, sol)
            ]

        matched = False
        for _, sol_r in candidates:
            merged = _merge(sol_l, sol_r)
            if filter_fn is None or filter_fn(merged):
                results.append(merged)
                matched = True

        if not matched:
            results.append(dict(sol_l))

    return results
def minus(left, right):
    """
    MINUS operation: remove left solutions that are compatible with any
    right solution sharing at least one variable.

    The shared-variable requirement is checked per pair of solutions.
    A right solution that has no variable in common with a given left
    solution can never remove it, even if other right solutions do
    mention the left solution's variables.

    Args:
        left: list of solutions (dict of variable name -> Term).
        right: list of solutions to subtract.

    Returns:
        A new list holding the surviving left solutions, in input order.
    """
    if not right:
        return list(left)

    def _is_removed(sol_l):
        # The disjointness test must come first: _compatible() is
        # vacuously true for solutions with no variable in common.
        return any(
            not sol_l.keys().isdisjoint(sol_r)
            and _compatible(sol_l, sol_r)
            for sol_r in right
        )

    return [sol_l for sol_l in left if not _is_removed(sol_l)]
def union(left, right):
    """
    Union of two solution sequences.

    Returns a new list with all left solutions followed by all right
    solutions; duplicates are kept.
    """
    return [*left, *right]
def project(solutions, variables):
    """
    Restrict every solution to the given variables.

    Variables a solution does not bind are simply left out of its
    projected form.  Returns a new list of new dicts.
    """
    projected = []
    for sol in solutions:
        row = {}
        for name in variables:
            if name in sol:
                row[name] = sol[name]
        projected.append(row)
    return projected
def distinct(solutions):
    """
    Drop duplicate solutions, keeping the first occurrence of each.

    Two solutions are duplicates when they bind the same variables to
    equal terms.  The order of the surviving solutions is preserved.
    """
    first_seen = {}
    for sol in solutions:
        # Sort the bindings so the fingerprint ignores dict order
        fingerprint = tuple(sorted(
            (name, _term_key(term)) for name, term in sol.items()
        ))
        # setdefault keeps the earliest solution for each fingerprint;
        # dicts preserve insertion order.
        first_seen.setdefault(fingerprint, sol)
    return list(first_seen.values())
def _sort_comparable(val):
"""Convert a value to a form suitable for sort ordering."""
if val is None:
return (0, "")
if isinstance(val, (int, float)):
return (2, val)
if isinstance(val, Term):
if val.type == LITERAL:
try:
if "." in val.value:
return (2, float(val.value))
return (2, int(val.value))
except (ValueError, TypeError):
pass
return (3, val.value)
elif val.type == IRI:
return (4, val.iri)
elif val.type == BLANK:
return (5, val.id)
return (6, str(val))
def order_by(solutions, key_fns):
    """
    Sort solutions by the given key functions.

    key_fns is a list of (fn, ascending) tuples where fn extracts
    a comparable value from a solution.  Earlier entries take precedence;
    later entries break ties.

    The sort is stable in every case: solutions whose keys all compare
    equal keep their input order, whether the keys are ascending,
    descending or mixed.

    Args:
        solutions: list of solutions to sort.
        key_fns: list of (fn, ascending) tuples.

    Returns:
        ``solutions`` itself when ``key_fns`` is empty, otherwise a new
        sorted list.
    """
    if not key_fns:
        return solutions

    directions = {bool(ascending) for _, ascending in key_fns}

    if len(directions) > 1:
        # Mixed ascending/descending needs a per-key comparator.  Going
        # straight there avoids sorting once just to throw it away.
        return _mixed_sort(solutions, key_fns)

    def sort_key(sol):
        return [_sort_comparable(fn(sol)) for fn, _ in key_fns]

    # All keys share one direction.  reverse=True (rather than reversing
    # an ascending result) keeps tied solutions in input order.
    descending = not directions.pop()
    return sorted(solutions, key=sort_key, reverse=descending)
def _mixed_sort(solutions, key_fns):
"""Sort with mixed ascending/descending keys."""
import functools
def compare(a, b):
for fn, ascending in key_fns:
ka = _sort_comparable(fn(a))
kb = _sort_comparable(fn(b))
if ka < kb:
return -1 if ascending else 1
elif ka > kb:
return 1 if ascending else -1
return 0
return sorted(solutions, key=functools.cmp_to_key(compare))
def slice_solutions(solutions, offset=0, limit=None):
    """
    Apply OFFSET and LIMIT to a solution sequence.

    ``offset`` solutions are skipped first, then at most ``limit``
    solutions are kept.  A falsy offset skips nothing and a ``limit`` of
    None keeps everything; when neither applies the input sequence is
    returned as is.
    """
    remaining = solutions[offset:] if offset else solutions
    return remaining if limit is None else remaining[:limit]