mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-27 16:25:12 +02:00
Add 30+ SPARQL 1.1 built-in functions and the MINUS algebra operator to the custom SPARQL query backend. String functions: - SUBSTR (2-arg and 3-arg forms), STRBEFORE, STRAFTER - REPLACE (regex with flags), ENCODE_FOR_URI Numeric functions: - FLOOR, CEIL, ROUND, ABS Date/time accessors: - YEAR, MONTH, DAY, HOURS, MINUTES, SECONDS - NOW, TZ Hash functions: - MD5, SHA1, SHA256, SHA512 Term constructors: - IRI/URI, BNODE, UUID, STRUUID Other functions: - LANGMATCHES, RAND - EXISTS / NOT EXISTS (with async pre-evaluation to bridge the sync expression evaluator and async algebra evaluator) Algebra: - MINUS set-difference operator - HAVING already works via rdflib's Filter mapping (verified) Fix SPARQL ORDER handling Includes 653 lines of new unit tests covering all added functionality across expressions, solutions, and algebra layers.
285 lines
7.5 KiB
Python
285 lines
7.5 KiB
Python
"""
|
|
Solution sequence operations for SPARQL evaluation.
|
|
|
|
A solution is a dict mapping variable names (str) to Term values.
|
|
A solution sequence is a list of solutions.
|
|
"""
|
|
|
|
import logging
|
|
from collections import defaultdict
|
|
|
|
from ... schema import Term, IRI, LITERAL, BLANK
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _term_key(term):
|
|
"""Create a hashable key from a Term for join/distinct operations."""
|
|
if term is None:
|
|
return None
|
|
if term.type == IRI:
|
|
return ("i", term.iri)
|
|
elif term.type == LITERAL:
|
|
return ("l", term.value, term.datatype, term.language)
|
|
elif term.type == BLANK:
|
|
return ("b", term.id)
|
|
else:
|
|
return ("?", str(term))
|
|
|
|
|
|
def _solution_key(solution, variables):
|
|
"""Create a hashable key from a solution for the given variables."""
|
|
return tuple(_term_key(solution.get(v)) for v in variables)
|
|
|
|
|
|
def _terms_equal(a, b):
|
|
"""Check if two Terms are equal."""
|
|
if a is None and b is None:
|
|
return True
|
|
if a is None or b is None:
|
|
return False
|
|
return _term_key(a) == _term_key(b)
|
|
|
|
|
|
def _compatible(sol_a, sol_b):
|
|
"""Check if two solutions are compatible (agree on shared variables)."""
|
|
shared = set(sol_a.keys()) & set(sol_b.keys())
|
|
return all(_terms_equal(sol_a[v], sol_b[v]) for v in shared)
|
|
|
|
|
|
def _merge(sol_a, sol_b):
|
|
"""Merge two compatible solutions into one."""
|
|
result = dict(sol_a)
|
|
result.update(sol_b)
|
|
return result
|
|
|
|
|
|
def hash_join(left, right):
    """
    Inner join two solution sequences on shared variables.

    Two solutions join when they are compatible, i.e. they agree on every
    variable that both of them bind. Solutions that bind all of the shared
    variables are matched through a hash table built on the smaller side;
    solutions that leave a shared variable unbound fall back to a
    compatibility scan, because an unbound variable must not prevent a
    match.

    Returns a new list of merged solutions; an empty list if either input
    is empty. With no shared variables the result is the cross product.
    """
    if not left or not right:
        return []

    left_vars = set()
    for sol in left:
        left_vars.update(sol.keys())

    right_vars = set()
    for sol in right:
        right_vars.update(sol.keys())

    shared = sorted(left_vars & right_vars)

    if not shared:
        # Cross product: nothing in common, so every pair is compatible.
        return [_merge(l, r) for l in left for r in right]

    def binds_all_shared(sol):
        return all(v in sol for v in shared)

    # Build the hash table on the smaller side, probe with the other.
    build_is_left = len(left) <= len(right)
    if build_is_left:
        build, probe = left, right
    else:
        build, probe = right, left

    index = defaultdict(list)
    partial = []  # build-side solutions missing at least one shared variable
    for sol in build:
        if binds_all_shared(sol):
            index[_solution_key(sol, shared)].append(sol)
        else:
            partial.append(sol)

    results = []
    for sol_p in probe:
        if binds_all_shared(sol_p):
            candidates = list(index.get(_solution_key(sol_p, shared), []))
            candidates.extend(s for s in partial if _compatible(s, sol_p))
        else:
            # The hash key is meaningless for a partially bound probe.
            candidates = [s for s in build if _compatible(s, sol_p)]

        for sol_b in candidates:
            # Always merge left-then-right, whichever side was indexed.
            if build_is_left:
                results.append(_merge(sol_b, sol_p))
            else:
                results.append(_merge(sol_p, sol_b))

    return results
|
|
|
|
|
|
def left_join(left, right, filter_fn=None):
    """
    Left outer join (OPTIONAL semantics).

    Every left solution is preserved. If it is compatible with right
    solutions (and the merged result passes the optional filter), the
    merged solutions are included. Otherwise a copy of the original left
    solution is kept.

    Compatibility means agreeing on every variable bound in both
    solutions. Solutions that leave a shared variable unbound are matched
    by a compatibility scan instead of the hash table, so an unbound
    variable does not block a match.

    filter_fn, when given, is called with each merged solution and must
    return a truthy value for the merged solution to be kept.
    """
    if not left:
        return []

    if not right:
        return list(left)

    right_vars = set()
    for sol in right:
        right_vars.update(sol.keys())

    left_vars = set()
    for sol in left:
        left_vars.update(sol.keys())

    shared = sorted(left_vars & right_vars)

    def binds_all_shared(sol):
        # Vacuously true when there are no shared variables.
        return all(v in sol for v in shared)

    # Build hash table on right side; with no shared variables every
    # right solution lands under the empty key.
    index = defaultdict(list)
    partial = []  # right solutions missing at least one shared variable
    for sol in right:
        if binds_all_shared(sol):
            index[_solution_key(sol, shared)].append(sol)
        else:
            partial.append(sol)

    results = []
    for sol_l in left:
        if binds_all_shared(sol_l):
            matches = list(index.get(_solution_key(sol_l, shared), []))
            matches.extend(s for s in partial if _compatible(sol_l, s))
        else:
            matches = [s for s in right if _compatible(sol_l, s)]

        matched = False
        for sol_r in matches:
            merged = _merge(sol_l, sol_r)
            if filter_fn is None or filter_fn(merged):
                results.append(merged)
                matched = True

        if not matched:
            results.append(dict(sol_l))

    return results
|
|
|
|
|
|
def minus(left, right):
    """
    MINUS operation: remove left solutions that are compatible with any
    right solution sharing at least one variable.

    The shared-variable test is applied per right solution: a right
    solution whose variables are disjoint from the left solution's never
    removes it, even if other right solutions do mention those variables.

    Returns a new list holding the surviving left solutions, in order.
    """
    if not right:
        return list(left)

    results = []
    for sol_l in left:
        left_names = sol_l.keys()
        removed = any(
            # Overlap is checked first so disjoint pairs are skipped.
            not left_names.isdisjoint(sol_r) and _compatible(sol_l, sol_r)
            for sol_r in right
        )
        if not removed:
            results.append(sol_l)

    return results
|
|
|
|
|
|
def union(left, right):
    """
    Concatenate two solution sequences into a new list.

    All left solutions come first, followed by all right solutions;
    duplicates are kept.
    """
    return [*left, *right]
|
|
|
|
|
|
def project(solutions, variables):
    """
    Restrict every solution to the given variables.

    Returns a new list of new dicts. Each dict lists its variables in the
    order given by `variables`; a variable absent from a solution is
    simply omitted for that solution.
    """
    projected = []
    for sol in solutions:
        row = {}
        for name in variables:
            if name in sol:
                row[name] = sol[name]
        projected.append(row)
    return projected
|
|
|
|
|
|
def distinct(solutions):
    """
    Drop duplicate solutions, keeping the first occurrence of each.

    Two solutions are duplicates when they bind the same variables to
    terms with equal keys. The surviving solutions keep their input order
    and are the original objects, not copies.
    """
    seen = set()
    unique = []
    for sol in solutions:
        bindings = [(name, _term_key(term)) for name, term in sol.items()]
        # Sort so the fingerprint does not depend on dict insertion order.
        bindings.sort()
        fingerprint = tuple(bindings)
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append(sol)
    return unique
|
|
|
|
|
|
def _sort_comparable(val):
|
|
"""Convert a value to a form suitable for sort ordering."""
|
|
if val is None:
|
|
return (0, "")
|
|
if isinstance(val, (int, float)):
|
|
return (2, val)
|
|
if isinstance(val, Term):
|
|
if val.type == LITERAL:
|
|
try:
|
|
if "." in val.value:
|
|
return (2, float(val.value))
|
|
return (2, int(val.value))
|
|
except (ValueError, TypeError):
|
|
pass
|
|
return (3, val.value)
|
|
elif val.type == IRI:
|
|
return (4, val.iri)
|
|
elif val.type == BLANK:
|
|
return (5, val.id)
|
|
return (6, str(val))
|
|
|
|
|
|
def order_by(solutions, key_fns):
    """
    Sort solutions by the given key functions.

    key_fns is a list of (fn, ascending) tuples where fn extracts
    a comparable value from a solution. Earlier entries take precedence
    over later ones.

    With no key functions the input sequence is returned unchanged (the
    same object). Otherwise a new sorted list is returned. The sort is
    stable in every case: solutions that compare equal on all keys keep
    their input order, including when every key is descending.
    """
    if not key_fns:
        return solutions

    directions = {bool(ascending) for _, ascending in key_fns}

    if len(directions) > 1:
        # Mixed ascending/descending needs a per-key comparison.
        return _mixed_sort(solutions, key_fns)

    def sort_key(sol):
        return [_sort_comparable(fn(sol)) for fn, _ in key_fns]

    # reverse=True keeps ties in input order; sorting ascending and then
    # reversing the list would flip them.
    all_descending = directions == {False}
    return sorted(solutions, key=sort_key, reverse=all_descending)
|
|
|
|
|
|
def _mixed_sort(solutions, key_fns):
|
|
"""Sort with mixed ascending/descending keys."""
|
|
import functools
|
|
|
|
def compare(a, b):
|
|
for fn, ascending in key_fns:
|
|
ka = _sort_comparable(fn(a))
|
|
kb = _sort_comparable(fn(b))
|
|
|
|
if ka < kb:
|
|
return -1 if ascending else 1
|
|
elif ka > kb:
|
|
return 1 if ascending else -1
|
|
|
|
return 0
|
|
|
|
return sorted(solutions, key=functools.cmp_to_key(compare))
|
|
|
|
|
|
def slice_solutions(solutions, offset=0, limit=None):
    """
    Apply OFFSET and then LIMIT to a solution sequence.

    A falsy offset skips nothing and a limit of None keeps everything;
    when neither applies the input sequence itself is returned.
    """
    window = solutions
    if offset:
        window = window[offset:]
    if limit is None:
        return window
    return window[:limit]
|