Initial open-source release

This commit is contained in:
Andrey Avtomonov 2026-05-10 23:12:26 +02:00
commit 1a42152e6f
1199 changed files with 257054 additions and 0 deletions

104
python/klo-daemon/README.md Normal file
View file

@ -0,0 +1,104 @@
# klo-daemon
`klo-daemon` is the portable Python compute package for KLO.
It supports portable compute in two modes:
- One-shot commands, used by default by `@klo/context`.
- An explicit HTTP server for long-running local MCP sessions.
## One-shot semantic query
```bash
printf '%s\n' '{"sources":[],"query":{"measures":[],"dimensions":[]},"dialect":"postgres"}' \
| klo-daemon semantic-query
```
## One-shot source generation
Generate semantic-layer sources from schema scan data:
```bash
printf '%s\n' '{"tables":[{"name":"orders","db":"public","columns":[{"name":"id","type":"integer","primary_key":true}]}],"links":[],"dialect":"postgres"}' \
| klo-daemon semantic-generate-sources
```
## One-shot database introspection
Introspect a Postgres database schema:
```bash
printf '%s\n' '{"connection_id":"warehouse","driver":"postgres","url":"postgresql://readonly@example.test/warehouse","schemas":["public"]}' \
| klo-daemon database-introspect
```
## One-shot LookML parsing
Parse LookML projects into resolved, KSL-ready structures:
```bash
printf '%s\n' '{"files":[{"path":"views/orders.view.lkml","content":"view: orders { sql_table_name: public.orders ;; measure: order_count { type: count } }"}],"dialect":"postgres"}' \
| klo-daemon lookml-parse
```
## One-shot embeddings
Compute text embeddings locally:
```bash
printf '%s\n' '{"text":"hello"}' \
| klo-daemon embedding-compute
```
Compute text embeddings locally in bulk:
```bash
printf '%s\n' '{"texts":["hello","world"]}' \
| klo-daemon embedding-compute-bulk
```
## One-shot code execution
Execute Python code with the current in-process boundary:
```bash
printf '%s\n' '{"code":"result = 1 + 2"}' \
| klo-daemon code-execute
```
## HTTP compute server
Start the HTTP compute server with code execution disabled:
```bash
klo-daemon serve-http --host 127.0.0.1 --port 8765
```
Enable HTTP code execution explicitly:
```bash
klo-daemon serve-http --host 127.0.0.1 --port 8765 --enable-code-execution
```
Available HTTP endpoints:
- `GET /health`
- `POST /database/introspect`
- `POST /embeddings/compute`
- `POST /embeddings/compute-bulk`
- `POST /lookml/parse`
- `POST /semantic-layer/generate-sources`
- `POST /semantic-layer/query`
- `POST /semantic-layer/validate`
- `POST /sql/parse-table-identifier`
- `POST /code/execute` when `--enable-code-execution` is passed
The HTTP server exposes Postgres database introspection, LookML parsing, SQL
table identifier parsing, local embedding compute, and semantic-layer compute
for source generation, query compilation, and validation.
Code execution is off by default. When enabled, it runs Python `exec` in the
daemon process with the same in-process boundary as the one-shot
`code-execute` command and does not provide OS-level sandboxing.
HTTP code execution uses the standalone KLO boundary. It does not forward
caller authorization headers to a host app and does not connect scratchpad or
visualization helpers to host application APIs.

View file

@ -0,0 +1,50 @@
[project]
name = "klo-daemon"
version = "0.1.0"
description = "Portable compute package for KLO semantic-layer operations"
readme = "README.md"
requires-python = ">=3.13"
license = "Apache-2.0"
dependencies = [
"fastapi>=0.115.0",
"klo-sl",
"lkml>=1.3.7",
"numpy>=2.2.6",
"orjson>=3.11.4",
"pandas>=2.2.3",
"psycopg[binary]>=3.2.0",
"pydantic>=2.9.0",
"requests>=2.32.0",
"sentence-transformers>=5.1.1",
"sqlglot>=26",
"torch>=2.2.0",
"uvicorn[standard]>=0.32.0",
]
[project.scripts]
klo-daemon = "klo_daemon.__main__:main"
[project.urls]
Homepage = "https://github.com/kaelio/ktx"
Repository = "https://github.com/kaelio/ktx"
Issues = "https://github.com/kaelio/ktx/issues"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src/klo_daemon"]
[dependency-groups]
dev = [
"httpx>=0.28.1",
"pytest>=9.0.2",
]
[tool.uv.sources]
klo-sl = { workspace = true }
[tool.pytest.ini_options]
testpaths = ["tests"]
pythonpath = ["src"]

View file

@ -0,0 +1,6 @@
"""Portable compute package for KLO."""
# Distribution name of this package.
PACKAGE_NAME = "klo-daemon"
# Package version string.
# NOTE(review): presumably must stay in sync with the project metadata version - confirm.
VERSION = "0.1.0"
# Public names exported by ``from klo_daemon import *``.
__all__ = ["PACKAGE_NAME", "VERSION"]

View file

@ -0,0 +1,172 @@
"""Command entry point for one-shot KLO daemon compute operations."""
from __future__ import annotations
import argparse
import json
import sys
from typing import Any
from pydantic import ValidationError
from klo_daemon.code_execution import ExecuteCodeRequest, execute_code_response
from klo_daemon.database_introspection import (
DatabaseIntrospectionRequest,
introspect_database_response,
)
from klo_daemon.embeddings import (
ComputeEmbeddingBulkRequest,
ComputeEmbeddingRequest,
compute_embedding_bulk_response,
compute_embedding_response,
)
from klo_daemon.lookml import ParseLookMLRequest, parse_lookml_project
from klo_daemon.semantic_layer import (
SemanticLayerQueryRequest,
ValidateSourcesRequest,
query_semantic_layer,
validate_semantic_layer,
)
from klo_daemon.source_generation import (
GenerateSourcesRequest,
generate_sources_response,
)
def build_parser() -> argparse.ArgumentParser:
    """Build the ``klo-daemon`` command-line parser.

    One-shot compute commands are registered as option-free subcommands, in a
    fixed order. ``serve-http`` is the only subcommand with options of its
    own: ``--host``, ``--port``, ``--log-level`` and
    ``--enable-code-execution``.

    Returns:
        A parser whose parsed namespace carries the chosen subcommand in
        ``command``; a subcommand is required.
    """
    # (subcommand name, help text) pairs, in registration order.
    one_shot_commands = (
        ("semantic-query", "Compile a semantic-layer query"),
        ("semantic-validate", "Validate semantic-layer sources"),
        (
            "semantic-generate-sources",
            "Generate semantic-layer sources from schema scan data",
        ),
        ("database-introspect", "Introspect a Postgres database schema"),
        ("lookml-parse", "Parse LookML files into KSL-ready structures"),
        ("embedding-compute", "Compute one local text embedding"),
        ("embedding-compute-bulk", "Compute local text embeddings in bulk"),
        (
            "code-execute",
            "Execute Python code with the current in-process boundary",
        ),
    )

    root = argparse.ArgumentParser(prog="klo-daemon")
    registry = root.add_subparsers(dest="command", required=True)
    for command_name, help_text in one_shot_commands:
        registry.add_parser(command_name, help=help_text)

    http_options = registry.add_parser(
        "serve-http",
        help="Run the KLO daemon portable compute HTTP server",
    )
    http_options.add_argument("--host", default="127.0.0.1")
    http_options.add_argument("--port", type=int, default=8765)
    http_options.add_argument(
        "--log-level",
        default="info",
        choices=["critical", "error", "warning", "info", "debug", "trace"],
    )
    http_options.add_argument(
        "--enable-code-execution",
        action="store_true",
        help="Expose POST /code/execute on the HTTP server",
    )
    return root
def _read_stdin_json() -> dict[str, Any]:
raw = sys.stdin.read()
parsed = json.loads(raw)
if not isinstance(parsed, dict):
raise ValueError("stdin JSON must be an object")
return parsed
def run_http_server(
    *,
    host: str,
    port: int,
    log_level: str,
    enable_code_execution: bool,
) -> None:
    """Create the daemon app and serve it with uvicorn.

    Args:
        host: Interface passed to ``uvicorn.run``.
        port: TCP port passed to ``uvicorn.run``.
        log_level: Log level name passed to ``uvicorn.run``.
        enable_code_execution: Forwarded to ``create_app``.
    """
    # Imported inside the function rather than at module level; presumably so
    # the one-shot commands do not load the HTTP stack - confirm.
    import uvicorn
    from klo_daemon.app import create_app

    application = create_app(enable_code_execution=enable_code_execution)
    uvicorn.run(application, host=host, port=port, log_level=log_level)
def main(argv: list[str] | None = None) -> int:
    """Run one ``klo-daemon`` command and return its process exit status.

    ``serve-http`` starts the HTTP server. Every other command reads a JSON
    object from stdin, validates it into that command's request model, runs
    the command, and writes the response as one JSON line to stdout.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when the command fails (the error is written
        to stderr).
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.command == "serve-http":
        run_http_server(
            host=args.host,
            port=args.port,
            log_level=args.log_level,
            enable_code_execution=args.enable_code_execution,
        )
        return 0

    # One entry per one-shot command: validate the payload, then run it.
    handlers = {
        "semantic-query": lambda body: query_semantic_layer(
            SemanticLayerQueryRequest.model_validate(body)
        ),
        "semantic-validate": lambda body: validate_semantic_layer(
            ValidateSourcesRequest.model_validate(body)
        ),
        "semantic-generate-sources": lambda body: generate_sources_response(
            GenerateSourcesRequest.model_validate(body)
        ),
        "database-introspect": lambda body: introspect_database_response(
            DatabaseIntrospectionRequest.model_validate(body)
        ),
        "lookml-parse": lambda body: parse_lookml_project(
            ParseLookMLRequest.model_validate(body)
        ),
        "embedding-compute": lambda body: compute_embedding_response(
            ComputeEmbeddingRequest.model_validate(body)
        ),
        "embedding-compute-bulk": lambda body: compute_embedding_bulk_response(
            ComputeEmbeddingBulkRequest.model_validate(body)
        ),
        # The CLI never forwards a host API URL or Authorization header.
        "code-execute": lambda body: execute_code_response(
            ExecuteCodeRequest.model_validate(body),
            nest_api_url=None,
            auth_header=None,
        ),
    }

    try:
        payload = _read_stdin_json()
        handler = handlers.get(args.command)
        if handler is None:
            # parser.error() exits the process; the return is a safety net.
            parser.error(f"Unknown command: {args.command}")
            return 2
        response = handler(payload)
        sys.stdout.write(response.model_dump_json() + "\n")
        return 0
    except (json.JSONDecodeError, ValidationError, ValueError) as error:
        # Input problems: print just the message.
        sys.stderr.write(f"{error}\n")
        return 1
    except Exception as error:
        # Anything else: include the exception type for diagnosis.
        sys.stderr.write(f"{type(error).__name__}: {error}\n")
        return 1
# Script entry point: main()'s integer return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -0,0 +1,228 @@
"""FastAPI app factory for the KLO daemon semantic compute server."""
from __future__ import annotations
import logging
from collections.abc import Callable
from typing import Any
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from klo_daemon.code_execution import (
ExecuteCodeRequest,
ExecuteCodeResponse,
dumps_numpy_json,
execute_code_response,
)
from klo_daemon.database_introspection import (
DatabaseIntrospectionRequest,
DatabaseIntrospectionResponse,
introspect_database_response,
)
from klo_daemon.embeddings import (
ComputeEmbeddingBulkRequest,
ComputeEmbeddingBulkResponse,
ComputeEmbeddingRequest,
ComputeEmbeddingResponse,
EmbeddingProvider,
compute_embedding_bulk_response,
compute_embedding_response,
)
from klo_daemon.lookml import (
ParseLookMLRequest,
ParseLookMLResponse,
parse_lookml_project,
)
from klo_daemon.semantic_layer import (
SemanticLayerQueryRequest,
SemanticLayerQueryResponse,
ValidateSourcesRequest,
ValidateSourcesResponse,
query_semantic_layer,
validate_semantic_layer,
)
from klo_daemon.source_generation import (
GenerateSourcesRequest,
GenerateSourcesResponse,
generate_sources_response,
)
from klo_daemon.table_identifier import (
ParseTableIdentifierBatchRequest,
ParseTableIdentifierBatchResponse,
parse_table_identifier_response,
)
logger = logging.getLogger(__name__)


class NumpyORJSONResponse(Response):
    """JSON response whose body is produced by ``dumps_numpy_json``."""

    media_type = "application/json"

    def render(self, content: Any) -> bytes:
        """Encode ``content`` into the JSON bytes sent as the response body."""
        encoded = dumps_numpy_json(content)
        return encoded
def create_app(
    *,
    embedding_provider: EmbeddingProvider | None = None,
    database_introspector: Callable[
        [DatabaseIntrospectionRequest], DatabaseIntrospectionResponse
    ]
    | None = None,
    enable_code_execution: bool = False,
) -> FastAPI:
    """Build the FastAPI application exposing KLO daemon compute over HTTP.

    Args:
        embedding_provider: Passed as ``provider`` to both embedding
            endpoints.
        database_introspector: Callable used by ``POST /database/introspect``
            in place of ``introspect_database_response`` when provided.
        enable_code_execution: ``POST /code/execute`` is registered only when
            this is true.

    Returns:
        The configured FastAPI application.
    """
    app = FastAPI(
        title="KLO Daemon",
        description="Stateless portable compute server for KLO.",
        version="0.1.0",
    )

    # Liveness probe; always reports "healthy".
    @app.get("/health")
    async def health() -> dict[str, str]:
        return {"status": "healthy"}

    # ValueError is reported as HTTP 400; any other exception as HTTP 500.
    @app.post("/database/introspect", response_model=DatabaseIntrospectionResponse)
    async def database_introspect(
        request: DatabaseIntrospectionRequest,
    ) -> DatabaseIntrospectionResponse:
        try:
            # Fall back to the built-in introspector when none was injected.
            introspector = database_introspector or introspect_database_response
            return introspector(request)
        except ValueError as error:
            logger.warning("Database introspection rejected: %s", error)
            raise HTTPException(status_code=400, detail=str(error)) from error
        except Exception as error:
            logger.exception("Database introspection failed: %s", error)
            raise HTTPException(
                status_code=500,
                detail=f"Database introspection failed: {error}",
            ) from error

    # ValueError is reported as HTTP 400; any other exception as HTTP 500.
    @app.post("/embeddings/compute", response_model=ComputeEmbeddingResponse)
    async def embedding_compute(
        request: ComputeEmbeddingRequest,
    ) -> ComputeEmbeddingResponse:
        try:
            return compute_embedding_response(
                request,
                provider=embedding_provider,
            )
        except ValueError as error:
            logger.warning("Embedding compute rejected: %s", error)
            raise HTTPException(status_code=400, detail=str(error)) from error
        except Exception as error:
            logger.exception("Embedding compute failed: %s", error)
            raise HTTPException(
                status_code=500,
                detail=f"Embedding compute failed: {error}",
            ) from error

    # Bulk variant of /embeddings/compute with the same 400/500 mapping.
    @app.post(
        "/embeddings/compute-bulk",
        response_model=ComputeEmbeddingBulkResponse,
    )
    async def embedding_compute_bulk(
        request: ComputeEmbeddingBulkRequest,
    ) -> ComputeEmbeddingBulkResponse:
        try:
            return compute_embedding_bulk_response(
                request,
                provider=embedding_provider,
            )
        except ValueError as error:
            logger.warning("Bulk embedding compute rejected: %s", error)
            raise HTTPException(status_code=400, detail=str(error)) from error
        except Exception as error:
            logger.exception("Bulk embedding compute failed: %s", error)
            raise HTTPException(
                status_code=500,
                detail=f"Bulk embedding compute failed: {error}",
            ) from error

    # The code execution route exists only when explicitly enabled.
    if enable_code_execution:

        # No host API URL or Authorization header is forwarded to the executor.
        @app.post(
            "/code/execute",
            response_model=ExecuteCodeResponse,
            response_class=NumpyORJSONResponse,
        )
        async def code_execute(request: ExecuteCodeRequest) -> ExecuteCodeResponse:
            try:
                return execute_code_response(
                    request,
                    nest_api_url=None,
                    auth_header=None,
                )
            except Exception as error:
                logger.exception("Code execution failed: %s", error)
                raise HTTPException(
                    status_code=500,
                    detail=f"Code execution failed: {error}",
                ) from error

    # Every exception, including ValueError, is reported as HTTP 500 here.
    @app.post("/lookml/parse", response_model=ParseLookMLResponse)
    async def lookml_parse(request: ParseLookMLRequest) -> ParseLookMLResponse:
        try:
            return parse_lookml_project(request)
        except Exception as error:
            logger.exception("LookML parsing failed: %s", error)
            raise HTTPException(
                status_code=500,
                detail=f"LookML parsing failed: {error}",
            ) from error

    # Every exception is reported as HTTP 500.
    @app.post(
        "/sql/parse-table-identifier",
        response_model=ParseTableIdentifierBatchResponse,
    )
    async def sql_parse_table_identifier(
        request: ParseTableIdentifierBatchRequest,
    ) -> ParseTableIdentifierBatchResponse:
        try:
            return parse_table_identifier_response(request)
        except Exception as error:
            logger.exception("Table identifier parsing failed: %s", error)
            raise HTTPException(
                status_code=500,
                detail=f"Table identifier parsing failed: {error}",
            ) from error

    # Every exception is reported as HTTP 500.
    @app.post(
        "/semantic-layer/generate-sources", response_model=GenerateSourcesResponse
    )
    async def semantic_generate_sources(
        request: GenerateSourcesRequest,
    ) -> GenerateSourcesResponse:
        try:
            return generate_sources_response(request)
        except Exception as error:
            logger.exception("Semantic source generation failed: %s", error)
            raise HTTPException(
                status_code=500,
                detail=f"Semantic source generation failed: {error}",
            ) from error

    # ValueError is reported as HTTP 400; any other exception as HTTP 500.
    @app.post("/semantic-layer/query", response_model=SemanticLayerQueryResponse)
    async def semantic_query(
        request: SemanticLayerQueryRequest,
    ) -> SemanticLayerQueryResponse:
        try:
            return query_semantic_layer(request)
        except ValueError as error:
            logger.warning("Semantic query rejected: %s", error)
            raise HTTPException(status_code=400, detail=str(error)) from error
        except Exception as error:
            logger.exception("Semantic query failed: %s", error)
            raise HTTPException(
                status_code=500,
                detail=f"Semantic layer query failed: {error}",
            ) from error

    # NOTE(review): unlike the other routes, this one has no local error
    # handling or logging; exceptions propagate to the framework - confirm
    # this is intended.
    @app.post("/semantic-layer/validate", response_model=ValidateSourcesResponse)
    async def semantic_validate(
        request: ValidateSourcesRequest,
    ) -> ValidateSourcesResponse:
        return validate_semantic_layer(request)

    return app

View file

@ -0,0 +1,333 @@
"""Portable in-process code execution helpers for KLO daemon.
This module preserves the host application's current Python execution behavior.
It runs code with Python ``exec`` in the current process and does not provide
OS-level sandboxing.
"""
from __future__ import annotations
import json
import logging
import re
import sys
from collections.abc import Callable
from io import BytesIO, StringIO
from typing import Any
import numpy as np
import orjson
import pandas as pd
import requests
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
# Visualization types accepted by the save_visualization helper; any other
# value is rejected with ValueError.
VALID_VISUALIZATION_TYPES = ["pie", "bar", "line", "area", "table", "boxplot"]
class ExecuteCodeRequest(BaseModel):
    """Request schema for executing Python code."""

    # Required: the source text handed to the executor.
    code: str = Field(description="Python code to execute")
    # Optional identifiers used by the scratchpad / visualization helpers.
    source_id: str | None = Field(
        default=None,
        description="Chat/dashboard ID for scratchpad file access",
    )
    message_id: str | None = Field(
        default=None,
        description="Message ID for visualization association",
    )
class VisualizationSpec(BaseModel):
    """Specification for a visualization to be saved by the host application."""

    type: str = Field(..., description="Type marker, always 'visualization'")
    # The description is derived from VALID_VISUALIZATION_TYPES so the
    # published schema cannot drift from the accepted values (the previous
    # hard-coded text omitted "boxplot").
    vis_type: str = Field(
        ...,
        description=f"Visualization type: {', '.join(VALID_VISUALIZATION_TYPES)}",
    )
    config: dict[str, Any] = Field(
        ...,
        description="Visualization configuration",
    )
    data: list[dict[str, Any]] = Field(
        ...,
        description="Visualization data",
    )
    title: str | None = Field(None, description="Optional title")
class ExecuteCodeResponse(BaseModel):
    """Response schema for code execution."""

    # Always present: the human-readable rendering of the outcome.
    formatted_result: str = Field(
        description="Formatted execution result for display",
    )
    # Everything below is optional and defaults to None.
    result: Any | None = Field(
        default=None,
        description="The value of the 'result' variable if set",
    )
    console_output: str | None = Field(
        default=None,
        description="Captured stdout from print statements",
    )
    error: str | None = Field(
        default=None,
        description="Error message if execution failed",
    )
    message: str | None = Field(
        default=None,
        description="Message if no clear result was returned",
    )
    visualizations: list[VisualizationSpec] | None = Field(
        default=None,
        description="List of visualizations detected in the result",
    )
# Triple of helper callables returned by create_scratchpad_helpers, in order:
#   1. save a DataFrame under an optional filename, returning a status string
#   2. read a stored file by name into a DataFrame
#   3. save a visualization (type, config, data), returning a string
ScratchpadHelpers = tuple[
    Callable[[pd.DataFrame, str | None], str],
    Callable[[str], pd.DataFrame],
    Callable[[str, dict[str, Any], list[dict[str, Any]]], str],
]
def dumps_numpy_json(content: Any) -> bytes:
    """Serialize ``content`` to JSON bytes with orjson's numpy support enabled."""
    numpy_aware = orjson.OPT_SERIALIZE_NUMPY
    return orjson.dumps(content, option=numpy_aware)
def _strip_ansi_sequences(text: str) -> str:
ansi_escape = re.compile(
r"\x1b\[[0-9;]*[a-zA-Z]|\x1b\([0-9;]*[a-zA-Z]|\x1b\[[0-9;]*~"
)
return ansi_escape.sub("", text)
def create_scratchpad_helpers(
    nest_api_url: str | None,
    auth_header: str | None,
    source_id: str | None,
    message_id: str | None = None,
    http_client: Any = requests,
) -> ScratchpadHelpers:
    """Create scratchpad and visualization helpers that call host app APIs.

    Args:
        nest_api_url: Base URL that the ``/private_api/...`` paths are
            appended to.
        auth_header: Value sent as the ``Authorization`` header.
        source_id: Identifier placed in the scratchpad/visualization URLs.
        message_id: Sent as ``messageId`` when saving a visualization.
        http_client: Object providing ``post`` and ``get``; defaults to the
            ``requests`` module.

    Returns:
        ``(save_df_to_scratchpad, read_scratchpad_file, save_visualization)``.
        Each helper raises ``ValueError`` when called without a URL,
        Authorization header, or source ID, so creating helpers without them
        is allowed.
    """
    # The three values are fixed for the helpers' lifetime, so check them once.
    host_context_missing = not (nest_api_url and auth_header and source_id)

    def _require_host_context(failure_message: str) -> None:
        if host_context_missing:
            raise ValueError(failure_message)

    def _json_headers() -> dict[str, Any]:
        return {"Authorization": auth_header, "Content-Type": "application/json"}

    def save_df_to_scratchpad(df: pd.DataFrame, filename: str | None = None) -> str:
        _require_host_context(
            "nest_api_url, Authorization header, and source_id are required "
            "for scratchpad operations"
        )
        records = df.to_dict(orient="records")
        body = dumps_numpy_json(
            {"filename": filename, "data": records, "format": "json"}
        )
        reply = http_client.post(
            f"{nest_api_url}/private_api/scratchpad/{source_id}/files",
            data=body,
            headers=_json_headers(),
            timeout=30,
        )
        reply.raise_for_status()
        stored_name = reply.json()["filename"]
        row_count, _column_count = df.shape
        return f"{row_count} rows saved to {stored_name}"

    def read_scratchpad_file(filename: str) -> pd.DataFrame:
        _require_host_context(
            "nest_api_url, Authorization header, and source_id are required "
            "for scratchpad operations"
        )
        reply = http_client.get(
            f"{nest_api_url}/private_api/scratchpad/{source_id}/files/{filename}?format=raw",
            headers={"Authorization": auth_header, "Accept": "text/csv"},
            timeout=30,
        )
        reply.raise_for_status()
        # CSV replies are parsed from the raw bytes; anything else is read as
        # a JSON document with a "data" key.
        if "text/csv" in reply.headers.get("content-type", ""):
            return pd.read_csv(BytesIO(reply.content))
        return pd.DataFrame(reply.json()["data"])

    def save_visualization(
        vis_type: str,
        config: dict[str, Any],
        data: list[dict[str, Any]],
    ) -> str:
        _require_host_context(
            "nest_api_url, Authorization header, and source_id are required "
            "for visualization operations"
        )
        if not message_id:
            raise ValueError("message_id is required for visualization operations")
        if vis_type not in VALID_VISUALIZATION_TYPES:
            raise ValueError(
                f"Invalid visualization type: {vis_type}. Must be one of {VALID_VISUALIZATION_TYPES}"
            )
        body = dumps_numpy_json(
            {
                "visualizationType": vis_type,
                "config": config,
                "data": data,
                "messageId": message_id,
            }
        )
        reply = http_client.post(
            f"{nest_api_url}/private_api/visualizations/{source_id}",
            data=body,
            headers=_json_headers(),
            timeout=30,
        )
        reply.raise_for_status()
        stored_name = reply.json()["filename"]
        print(f"Visualization saved: {stored_name}")
        return f"![viz]({stored_name})"

    return save_df_to_scratchpad, read_scratchpad_file, save_visualization
def detect_visualizations(result: Any) -> list[dict[str, Any]]:
    """Collect visualization specs from a code execution result value.

    A spec is a dict whose ``"type"`` entry equals ``"visualization"``.
    ``result`` may be a single spec or a list containing specs; list items
    that are not specs are ignored, and any other value yields an empty list.
    """

    def _is_spec(candidate: Any) -> bool:
        return isinstance(candidate, dict) and candidate.get("type") == "visualization"

    if _is_spec(result):
        return [result]
    if isinstance(result, list):
        return [entry for entry in result if _is_spec(entry)]
    return []
def execute_code(
code: str,
nest_api_url: str | None = None,
auth_header: str | None = None,
source_id: str | None = None,
message_id: str | None = None,
scratchpad_helpers: ScratchpadHelpers | None = None,
) -> dict[str, Any]:
"""Execute Python code with the current in-process execution boundary."""
logger.info("Starting code execution")
save_df, read_file, save_viz = scratchpad_helpers or create_scratchpad_helpers(
nest_api_url,
auth_header,
source_id,
message_id,
)
namespace = {
"pd": pd,
"np": np,
"json": json,
"requests": requests,
"save_df_to_scratchpad": save_df,
"read_scratchpad_file": read_file,
"save_visualization": save_viz,
}
stdout_capture = StringIO()
original_stdout = sys.stdout
sys.stdout = stdout_capture
console_output = ""
try:
logger.info("Executing code in current process namespace")
exec(code, namespace)
console_output = stdout_capture.getvalue()
if "result" in namespace:
logger.info("Code execution complete, 'result' variable found")
result_value = namespace["result"]
visualizations = detect_visualizations(result_value)
result = {"result": result_value}
if console_output:
result["console_output"] = console_output
if visualizations:
result["visualizations"] = visualizations
return result
logger.info("No result variable found")
result = {
"message": "Code executed successfully but no result variable was set"
}
if console_output:
result["console_output"] = console_output
return result
except Exception as error:
logger.exception("Error executing code: %s", error)
result = {"error": str(error)}
if console_output:
result["console_output"] = console_output
return result
finally:
sys.stdout = original_stdout
def format_execution_result(result: dict[str, Any]) -> str:
    """Render an execution result dict as display text.

    Captured console output (with ANSI escapes stripped) comes first, followed
    by exactly one of the result, message or error sections. When several of
    those keys are present, ``"result"`` wins over ``"message"``, which wins
    over ``"error"``.
    """
    pieces: list[str] = []
    if "console_output" in result:
        pieces.append("=== Console Output ===\n\n")
        pieces.append(_strip_ansi_sequences(result["console_output"]))

    if "result" in result:
        pieces.append("\n\n=== Result ===\n\n")
        pieces.append(str(result["result"]))
    elif "message" in result:
        pieces.append("\n\n=== Message ===\n\n")
        pieces.append(result["message"])
    elif "error" in result:
        pieces.append("\n\n=== Error ===\n\n")
        pieces.append(result["error"])
    return "".join(pieces)
def execute_code_response(
    request: ExecuteCodeRequest,
    *,
    nest_api_url: str | None,
    auth_header: str | None,
) -> ExecuteCodeResponse:
    """Execute a validated request and return the public response model.

    Args:
        request: Supplies the code plus the optional source and message IDs.
        nest_api_url: Host API base URL forwarded to ``execute_code``.
        auth_header: Authorization header forwarded to ``execute_code``.

    Returns:
        The response model populated from the execution outcome; keys missing
        from the outcome become ``None``.
    """
    outcome = execute_code(
        code=request.code,
        nest_api_url=nest_api_url,
        auth_header=auth_header,
        source_id=request.source_id,
        message_id=request.message_id,
    )
    optional_fields = {
        field_name: outcome.get(field_name)
        for field_name in (
            "result",
            "console_output",
            "error",
            "message",
            "visualizations",
        )
    }
    return ExecuteCodeResponse(
        formatted_result=format_execution_result(outcome),
        **optional_fields,
    )

View file

@ -0,0 +1,284 @@
"""Portable database introspection helpers for KLO daemon."""
from __future__ import annotations
from collections.abc import Callable, Mapping, Sequence
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any
from pydantic import BaseModel, Field, field_validator
# Lists base tables in the requested schemas together with their comments.
# Takes one parameter: the array of schema names matched by ``any(%s)``.
# Result columns: table_catalog, table_schema, table_name, table_comment.
TABLES_SQL = """
select
t.table_catalog,
t.table_schema,
t.table_name,
obj_description(c.oid) as table_comment
from information_schema.tables t
join pg_catalog.pg_namespace n
on n.nspname = t.table_schema
join pg_catalog.pg_class c
on c.relnamespace = n.oid
and c.relname = t.table_name
where t.table_schema = any(%s)
and t.table_type = 'BASE TABLE'
order by t.table_schema, t.table_name
"""
# Lists the live (attnum > 0, not dropped) columns of relations with relkind
# 'r' or 'p' in the requested schemas, ordered by attribute number.
# Takes one parameter: the array of schema names matched by ``any(%s)``.
# Result columns: table_catalog, table_schema, table_name, column_name,
# formatted_type, is_nullable, is_primary_key, column_comment.
COLUMNS_SQL = """
select
current_database() as table_catalog,
n.nspname as table_schema,
c.relname as table_name,
a.attname as column_name,
pg_catalog.format_type(a.atttypid, a.atttypmod) as formatted_type,
not a.attnotnull as is_nullable,
exists (
select 1
from pg_catalog.pg_index i
where i.indrelid = c.oid
and i.indisprimary
and a.attnum = any(i.indkey)
) as is_primary_key,
pg_catalog.col_description(c.oid, a.attnum) as column_comment
from pg_catalog.pg_attribute a
join pg_catalog.pg_class c
on c.oid = a.attrelid
join pg_catalog.pg_namespace n
on n.oid = c.relnamespace
where n.nspname = any(%s)
and c.relkind in ('r', 'p')
and a.attnum > 0
and not a.attisdropped
order by n.nspname, c.relname, a.attnum
"""
# Lists foreign-key column pairs for tables in the requested schemas, pairing
# each source column with the referenced column at the matching position.
# Takes one parameter: the array of schema names matched by ``any(%s)``.
# Result columns: table_catalog, table_schema, table_name, from_column,
# to_table, to_column, constraint_name.
# NOTE(review): to_table is the bare target table name; the target schema is
# not selected, so cross-schema references are not distinguishable here.
FOREIGN_KEYS_SQL = """
select
current_database() as table_catalog,
source_constraint.table_schema,
source_constraint.table_name,
source_key.column_name as from_column,
target_key.table_name as to_table,
target_key.column_name as to_column,
source_constraint.constraint_name
from information_schema.table_constraints source_constraint
join information_schema.key_column_usage source_key
on source_key.constraint_catalog = source_constraint.constraint_catalog
and source_key.constraint_schema = source_constraint.constraint_schema
and source_key.constraint_name = source_constraint.constraint_name
join information_schema.referential_constraints ref_constraint
on ref_constraint.constraint_catalog = source_constraint.constraint_catalog
and ref_constraint.constraint_schema = source_constraint.constraint_schema
and ref_constraint.constraint_name = source_constraint.constraint_name
join information_schema.key_column_usage target_key
on target_key.constraint_catalog = ref_constraint.unique_constraint_catalog
and target_key.constraint_schema = ref_constraint.unique_constraint_schema
and target_key.constraint_name = ref_constraint.unique_constraint_name
and target_key.ordinal_position = source_key.position_in_unique_constraint
where source_constraint.constraint_type = 'FOREIGN KEY'
and source_constraint.table_schema = any(%s)
order by source_constraint.table_schema, source_constraint.table_name, source_constraint.constraint_name, source_key.ordinal_position
"""
# One column of an introspected table.
class LiveDatabaseColumn(BaseModel):
    # Column name.
    name: str
    # Column type as a string.
    type: str
    # Whether the column accepts NULL; defaults to True.
    nullable: bool = True
    # Whether the column is part of the primary key; defaults to False.
    primary_key: bool = False
    # Column comment, if any.
    comment: str | None = None
# One foreign-key column pair of an introspected table.
class LiveDatabaseForeignKey(BaseModel):
    # Referencing column on the owning table.
    from_column: str
    # Referenced table name.
    to_table: str
    # Referenced column on ``to_table``.
    to_column: str
    # Constraint name, if known.
    constraint_name: str | None = None
# One introspected table with its columns and foreign keys.
class LiveDatabaseTable(BaseModel):
    # Catalog the table belongs to, if known.
    catalog: str | None = None
    # NOTE(review): appears to hold the schema name (e.g. "public") - confirm.
    db: str | None = None
    # Table name.
    name: str
    # Table comment, if any.
    comment: str | None = None
    # Columns and foreign keys default to fresh empty lists per instance.
    columns: list[LiveDatabaseColumn] = Field(default_factory=list)
    foreign_keys: list[LiveDatabaseForeignKey] = Field(default_factory=list)
class DatabaseIntrospectionRequest(BaseModel):
    # Caller-chosen identifier, echoed back in the response.
    connection_id: str
    # Database driver name; only Postgres is supported by the introspector.
    driver: str = "postgres"
    # Connection string handed to the database driver.
    url: str
    # Schemas to inspect; a fresh ["public"] list is created per instance.
    schemas: list[str] = Field(default_factory=lambda: ["public"])
    # Both timeouts must be at least 1.
    statement_timeout_ms: int = Field(default=30_000, ge=1)
    connection_timeout_seconds: int = Field(default=5, ge=1)

    @field_validator("schemas")
    @classmethod
    def _schemas_must_not_be_empty(cls, value: list[str]) -> list[str]:
        # Reject an explicitly supplied empty list of schemas.
        if value:
            return value
        raise ValueError("database introspection requires at least one schema")
# Result of one database introspection run.
class DatabaseIntrospectionResponse(BaseModel):
    # Copied from the request's connection_id.
    connection_id: str
    # Timestamp string recording when the introspection ran.
    extracted_at: str
    # Free-form details about the run.
    metadata: dict[str, Any]
    # The introspected tables.
    tables: list[LiveDatabaseTable]
# Immutable bundle of the three raw row sets produced by a rows loader:
# one sequence of row mappings each for tables, columns and foreign keys.
@dataclass(frozen=True)
class DatabaseIntrospectionRows:
    table_rows: Sequence[Mapping[str, Any]]
    column_rows: Sequence[Mapping[str, Any]]
    foreign_key_rows: Sequence[Mapping[str, Any]]
# Callable that turns an introspection request into its raw row sets.
DatabaseRowsLoader = Callable[[DatabaseIntrospectionRequest], DatabaseIntrospectionRows]
# Zero-argument callable returning a timestamp string.
NowProvider = Callable[[], str]
def _driver_name(driver: str) -> str:
return driver.strip().lower()
def _table_key(catalog: str | None, db: str | None, name: str) -> str:
return f"{catalog or ''}\u0000{db or ''}\u0000{name}"
def _optional_string(row: Mapping[str, Any], key: str) -> str | None:
value = row.get(key)
return value if isinstance(value, str) else None
def _required_string(row: Mapping[str, Any], key: str) -> str:
value = row.get(key)
if not isinstance(value, str) or not value:
raise ValueError(f"database introspection row is missing string field {key}")
return value
def _statement_timeout_config(statement_timeout_ms: int) -> tuple[str, tuple[str]]:
return (
"SELECT set_config('statement_timeout', %s, true)",
(f"{int(statement_timeout_ms)}ms",),
)
def _load_postgres_rows(
    request: DatabaseIntrospectionRequest,
) -> DatabaseIntrospectionRows:
    """Run the three introspection queries against Postgres.

    All queries run inside a single explicit ``READ ONLY`` transaction with a
    transaction-local statement timeout. The connection is always closed.

    Args:
        request: Supplies the connection URL, timeouts and schema list.

    Returns:
        The raw table, column and foreign-key rows as dict rows.

    Raises:
        RuntimeError: If psycopg is not installed.
        Exception: Any driver error, re-raised after a rollback.
    """
    try:
        import psycopg
        from psycopg.rows import dict_row
    except ImportError as error:
        raise RuntimeError(
            "psycopg is required for Postgres database introspection"
        ) from error
    connection = psycopg.connect(
        request.url,
        connect_timeout=request.connection_timeout_seconds,
        application_name="klo-daemon-database-introspection",
        row_factory=dict_row,
        # Transactions are managed by hand below. Without autocommit, psycopg
        # opens its own transaction before the first statement, so the manual
        # BEGIN READ ONLY would run inside an existing transaction and the
        # read-only mode would not be applied.
        autocommit=True,
    )
    try:
        connection.execute("BEGIN READ ONLY")
        try:
            connection.execute(*_statement_timeout_config(request.statement_timeout_ms))
            # All three queries take the schema list as their only parameter.
            params = (request.schemas,)
            table_rows = list(connection.execute(TABLES_SQL, params))
            column_rows = list(connection.execute(COLUMNS_SQL, params))
            foreign_key_rows = list(connection.execute(FOREIGN_KEYS_SQL, params))
            connection.execute("COMMIT")
        except Exception:
            connection.execute("ROLLBACK")
            raise
    finally:
        connection.close()
    return DatabaseIntrospectionRows(
        table_rows=table_rows,
        column_rows=column_rows,
        foreign_key_rows=foreign_key_rows,
    )
def _map_rows_to_tables(rows: DatabaseIntrospectionRows) -> list[LiveDatabaseTable]:
    """Assemble table models from raw table, column and foreign-key rows.

    Column and foreign-key rows are attached to the table with the same
    catalog, schema and name; rows that match no table row are dropped.

    Returns:
        The tables sorted by their catalog/schema/name key.

    Raises:
        ValueError: If a row lacks one of its required string fields.
    """
    by_key: dict[str, LiveDatabaseTable] = {}

    def _owning_table(row: Mapping[str, Any]) -> LiveDatabaseTable | None:
        # Resolve the table a column / foreign-key row belongs to, if known.
        catalog = _optional_string(row, "table_catalog")
        schema = _required_string(row, "table_schema")
        table_name = _required_string(row, "table_name")
        return by_key.get(_table_key(catalog, schema, table_name))

    for table_row in rows.table_rows:
        catalog = _optional_string(table_row, "table_catalog")
        schema = _required_string(table_row, "table_schema")
        table_name = _required_string(table_row, "table_name")
        by_key[_table_key(catalog, schema, table_name)] = LiveDatabaseTable(
            catalog=catalog,
            db=schema,
            name=table_name,
            comment=_optional_string(table_row, "table_comment"),
        )

    for column_row in rows.column_rows:
        owner = _owning_table(column_row)
        if owner is not None:
            owner.columns.append(
                LiveDatabaseColumn(
                    name=_required_string(column_row, "column_name"),
                    type=_required_string(column_row, "formatted_type"),
                    nullable=bool(column_row.get("is_nullable")),
                    primary_key=bool(column_row.get("is_primary_key")),
                    comment=_optional_string(column_row, "column_comment"),
                )
            )

    for fk_row in rows.foreign_key_rows:
        owner = _owning_table(fk_row)
        if owner is not None:
            owner.foreign_keys.append(
                LiveDatabaseForeignKey(
                    from_column=_required_string(fk_row, "from_column"),
                    to_table=_required_string(fk_row, "to_table"),
                    to_column=_required_string(fk_row, "to_column"),
                    constraint_name=_optional_string(fk_row, "constraint_name"),
                )
            )

    # Each dict key was built from the table's own catalog/db/name, so
    # ordering by dict key is ordering by the table key.
    return [by_key[key] for key in sorted(by_key)]
def introspect_database_response(
    request: DatabaseIntrospectionRequest,
    *,
    load_rows: DatabaseRowsLoader | None = None,
    now: NowProvider | None = None,
) -> DatabaseIntrospectionResponse:
    """Introspect the requested database and build the response model.

    Args:
        request: Connection details and the schemas to inspect.
        load_rows: Row loader used instead of the Postgres loader when given.
        now: Timestamp provider used instead of the current UTC time when
            given.

    Returns:
        The response carrying the connection ID, extraction timestamp, driver
        and schema metadata, and the mapped tables.

    Raises:
        ValueError: If the normalized driver is neither ``postgres`` nor
            ``postgresql``.
    """
    driver = _driver_name(request.driver)
    supported_drivers = {"postgres", "postgresql"}
    if driver not in supported_drivers:
        raise ValueError('database introspection supports only driver "postgres"')

    loader = load_rows or _load_postgres_rows
    rows = loader(request)

    if now:
        timestamp = now()
    else:
        timestamp = datetime.now(timezone.utc).isoformat()

    return DatabaseIntrospectionResponse(
        connection_id=request.connection_id,
        extracted_at=timestamp,
        metadata={"driver": driver, "schemas": list(request.schemas)},
        tables=_map_rows_to_tables(rows),
    )

View file

@ -0,0 +1,172 @@
"""Portable embedding compute helpers for KLO daemon."""
from __future__ import annotations
import logging
import threading
from typing import TYPE_CHECKING, Protocol
from pydantic import BaseModel, Field
if TYPE_CHECKING:
from sentence_transformers import SentenceTransformer
logger = logging.getLogger(__name__)
DEFAULT_SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2"
DEFAULT_EMBEDDING_DIMENSIONS = 384
DEFAULT_MAX_BATCH_SIZE = 100
class EmbeddingProvider(Protocol):
    """Provider interface for local embedding compute.

    This is a ``typing.Protocol``: any object exposing these members satisfies
    it structurally, without inheriting from this class.
    """

    # Short identifier of the embedding backend.
    @property
    def name(self) -> str: ...

    # Length of the vectors the provider reports producing.
    @property
    def dimensions(self) -> int: ...

    # Upper bound on texts per call; the compute helpers in this module pass
    # it to _validate_texts before calling encode().
    @property
    def max_batch_size(self) -> int: ...

    # Encode the given texts. Callers in this module index the result by input
    # position, so one vector per input text is expected.
    def encode(self, texts: list[str]) -> list[list[float]]: ...
class ComputeEmbeddingRequest(BaseModel):
    """Request schema for computing a single embedding."""

    # min_length=1 rejects the empty string at model validation time;
    # whitespace-only text is rejected later by _validate_texts.
    text: str = Field(..., description="Text to compute embedding for", min_length=1)
class ComputeEmbeddingResponse(BaseModel):
    """Response schema for single embedding computation."""

    # NOTE(review): the description hard-codes 384 (the value of
    # DEFAULT_EMBEDDING_DIMENSIONS); the field itself does not enforce a
    # vector length, so other providers may return a different size.
    embedding: list[float] = Field(..., description="384-dimensional embedding vector")
class ComputeEmbeddingBulkRequest(BaseModel):
    """Request schema for computing multiple embeddings."""

    # The length bounds apply to the list itself: at least one text and at
    # most DEFAULT_MAX_BATCH_SIZE texts. Individual blank strings are caught
    # later by _validate_texts, which also applies the provider's own limit.
    texts: list[str] = Field(
        ...,
        description="List of texts to compute embeddings for",
        min_length=1,
        max_length=DEFAULT_MAX_BATCH_SIZE,
    )
class ComputeEmbeddingBulkResponse(BaseModel):
    """Response schema for bulk embedding computation."""

    # Filled directly from the provider's encode() result for the request's
    # texts. NOTE(review): "384" in the description is not enforced here.
    embeddings: list[list[float]] = Field(
        ...,
        description="List of 384-dimensional embedding vectors",
    )
class SentenceTransformersEmbeddingProvider:
    """Embedding provider that loads its sentence-transformers model on demand.

    The ``sentence_transformers`` import and the model construction are
    deferred until the first call that needs the model, unless a ready model
    is passed to the constructor.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_SENTENCE_TRANSFORMER_MODEL,
        model: SentenceTransformer | None = None,
    ) -> None:
        """Remember the model name and, optionally, an already-built model."""
        self.model_name = model_name
        self._model = model
        # Guards the one-time model construction in _get_model().
        self._model_lock = threading.Lock()

    @property
    def name(self) -> str:
        """Return the fixed provider identifier."""
        return "sentence-transformers"

    @property
    def dimensions(self) -> int:
        """Return the module default dimension count (not read from the model)."""
        return DEFAULT_EMBEDDING_DIMENSIONS

    @property
    def max_batch_size(self) -> int:
        """Return the module default batch limit."""
        return DEFAULT_MAX_BATCH_SIZE

    def _get_model(self) -> SentenceTransformer:
        """Return the model, constructing it under the lock on first use."""
        if self._model is None:
            with self._model_lock:
                # Re-check after acquiring the lock: another thread may have
                # finished loading while this one was waiting.
                if self._model is None:
                    from sentence_transformers import SentenceTransformer

                    logger.info(
                        "Loading SentenceTransformer model: %s", self.model_name
                    )
                    self._model = SentenceTransformer(self.model_name)
                    logger.info("SentenceTransformer model loaded successfully")
        return self._model

    def encode(self, texts: list[str]) -> list[list[float]]:
        """Encode *texts* and return plain Python float lists.

        A single text is passed to the model as a bare string and its vector
        is wrapped in a one-element list; any other input is passed as a list.
        """
        model = self._get_model()
        if len(texts) != 1:
            vectors = model.encode(texts).tolist()
            return [[float(component) for component in vector] for vector in vectors]
        vector = model.encode(texts[0]).tolist()
        return [[float(component) for component in vector]]
_default_provider: SentenceTransformersEmbeddingProvider | None = None
_default_provider_lock = threading.Lock()
def get_default_embedding_provider() -> SentenceTransformersEmbeddingProvider:
    """Return the shared module-level provider, creating it on first request.

    Creation happens under ``_default_provider_lock`` and the global is
    re-checked after the lock is taken, so at most one instance is created.
    """
    global _default_provider
    if _default_provider is None:
        with _default_provider_lock:
            if _default_provider is None:
                _default_provider = SentenceTransformersEmbeddingProvider()
    return _default_provider
def _validate_texts(texts: list[str], max_batch_size: int) -> None:
if not texts:
raise ValueError("Texts array must not be empty")
if len(texts) > max_batch_size:
raise ValueError(f"Maximum {max_batch_size} texts allowed per batch")
empty_indices = [
index for index, text in enumerate(texts) if not text or not text.strip()
]
if empty_indices:
joined_indices = ", ".join(str(index) for index in empty_indices)
raise ValueError(f"Empty texts found at indices: {joined_indices}")
def compute_embedding_response(
    request: ComputeEmbeddingRequest,
    provider: EmbeddingProvider | None = None,
) -> ComputeEmbeddingResponse:
    """Validate the request text and return its embedding.

    Uses *provider* when given, otherwise the process-wide default provider.
    ``_validate_texts`` raises ``ValueError`` for blank text.
    """
    active_provider = provider or get_default_embedding_provider()
    single_batch = [request.text]
    _validate_texts(single_batch, active_provider.max_batch_size)
    vectors = active_provider.encode(single_batch)
    return ComputeEmbeddingResponse(embedding=vectors[0])
def compute_embedding_bulk_response(
    request: ComputeEmbeddingBulkRequest,
    provider: EmbeddingProvider | None = None,
) -> ComputeEmbeddingBulkResponse:
    """Validate the request texts and return their embeddings.

    Uses *provider* when given, otherwise the process-wide default provider.
    The batch is checked against the selected provider's ``max_batch_size``;
    ``_validate_texts`` raises ``ValueError`` when the check fails.
    """
    active_provider = provider or get_default_embedding_provider()
    batch = request.texts
    _validate_texts(batch, active_provider.max_batch_size)
    vectors = active_provider.encode(batch)
    return ComputeEmbeddingBulkResponse(embeddings=vectors)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,136 @@
"""Semantic-layer compute helpers for the KLO daemon package."""
from __future__ import annotations
from typing import Any
from pydantic import BaseModel, Field
from semantic_layer.duplicate_check import validate_measure_duplicates
from semantic_layer.engine import SemanticEngine
from semantic_layer.models import QueryResult, SourceDefinition
# Request payload for compiling a semantic-layer query to SQL.
class SemanticLayerQueryRequest(BaseModel):
    # Raw source definitions; each dict is expanded into SourceDefinition(**raw)
    # by _load_sources.
    sources: list[dict[str, Any]]
    # Query description handed unchanged to SemanticEngine.query().
    query: dict[str, Any]
    # SQL dialect passed to SemanticEngine.from_sources().
    dialect: str = "postgres"
# Result of compiling a semantic-layer query (built by query_semantic_layer).
class SemanticLayerQueryResponse(BaseModel):
    # Generated SQL text, taken from the engine result.
    sql: str
    # Dialect reported by the engine result.
    dialect: str
    # Column descriptors produced by _response_columns().
    columns: list[dict[str, Any]]
    # JSON-mode dump of the engine's resolved plan.
    plan: dict[str, Any]
# Request payload for validate_semantic_layer.
class ValidateSourcesRequest(BaseModel):
    # Raw source definitions; each is parsed individually so that one bad
    # source is reported without aborting the others.
    sources: list[dict[str, Any]]
    # SQL dialect passed to the engine and to the measure duplicate check.
    dialect: str = "postgres"
    # Optional names forwarded to engine.validate() as a set; None or an
    # empty list is forwarded as None.
    recently_touched: list[str] | None = None
# Outcome of validate_semantic_layer.
class ValidateSourcesResponse(BaseModel):
    # True exactly when `errors` is empty (see validate_semantic_layer).
    valid: bool
    # Parse failures, duplicate names and engine-reported errors.
    errors: list[str] = Field(default_factory=list)
    # Engine-reported warnings.
    warnings: list[str] = Field(default_factory=list)
    # Engine-reported warnings grouped by key (presumably the source name).
    per_source_warnings: dict[str, list[str]] = Field(default_factory=dict)
def _load_sources(raw_sources: list[dict[str, Any]]) -> dict[str, SourceDefinition]:
    """Parse raw source dicts into definitions keyed by source name.

    Raises:
        ValueError: If two sources share the same name.
    """
    by_name: dict[str, SourceDefinition] = {}
    for payload in raw_sources:
        definition = SourceDefinition(**payload)
        if definition.name in by_name:
            raise ValueError(f"Duplicate source name '{definition.name}'")
        by_name[definition.name] = definition
    return by_name
def _validate_duplicate_measure_names(source: SourceDefinition) -> list[str]:
errors: list[str] = []
seen: set[str] = set()
for measure in source.measures:
if measure.name in seen:
errors.append(
f"Duplicate measure '{measure.name}' on source '{source.name}'"
)
continue
seen.add(measure.name)
return errors
def _response_columns(result: QueryResult) -> list[dict[str, Any]]:
measure_names = {
measure.name: measure.qualified_ref
for measure in result.resolved_plan.measures
if measure.qualified_ref
}
columns: list[dict[str, Any]] = []
for column in result.columns:
dumped = column.model_dump(mode="json")
if column.provenance.value == "dimension" and column.expr:
dumped["name"] = column.expr
elif column.name in measure_names:
dumped["name"] = measure_names[column.name]
columns.append(dumped)
return columns
def query_semantic_layer(
    request: SemanticLayerQueryRequest,
) -> SemanticLayerQueryResponse:
    """Compile the request's query against its sources and return the SQL.

    Sources are parsed with ``_load_sources`` (which raises ``ValueError`` on
    duplicate names), an engine is built for the requested dialect, and the
    engine result is converted into the response model.
    """
    engine = SemanticEngine.from_sources(
        _load_sources(request.sources),
        dialect=request.dialect,
    )
    outcome = engine.query(request.query)
    response_columns = _response_columns(outcome)
    plan_dump = outcome.resolved_plan.model_dump(mode="json")
    return SemanticLayerQueryResponse(
        sql=outcome.sql,
        dialect=outcome.dialect,
        columns=response_columns,
        plan=plan_dump,
    )
def validate_semantic_layer(request: ValidateSourcesRequest) -> ValidateSourcesResponse:
    """Validate raw source definitions and collect every problem found.

    Each source is parsed on its own so a failure is recorded and the
    remaining sources are still checked. Sources that parse and have a unique
    name are then validated together by the engine and by
    ``validate_measure_duplicates``. Any exception raised during that combined
    step is reported as a single "Validation failed" error rather than
    propagated.
    """
    errors: list[str] = []
    warnings: list[str] = []
    per_source_warnings: dict[str, list[str]] = {}
    accepted: dict[str, SourceDefinition] = {}

    for raw_source in request.sources:
        # Capture the declared name up front so a parse failure can still be
        # attributed to the right source.
        declared_name = raw_source.get("name") if isinstance(raw_source, dict) else None
        try:
            parsed = SourceDefinition(**raw_source)
        except Exception as error:
            label = declared_name or "<unknown>"
            errors.append(f"Source '{label}' failed to parse: {error}")
            continue

        # Only first occurrences are stored, so the keys of `accepted` are
        # exactly the names seen so far.
        if parsed.name in accepted:
            errors.append(f"Duplicate source name '{parsed.name}'")
            continue

        accepted[parsed.name] = parsed
        errors.extend(_validate_duplicate_measure_names(parsed))

    if accepted:
        touched = set(request.recently_touched) if request.recently_touched else None
        try:
            engine = SemanticEngine.from_sources(accepted, dialect=request.dialect)
            report = engine.validate(recently_touched=touched)
            errors.extend(report.errors)
            warnings.extend(report.warnings)
            per_source_warnings.update(report.per_source_warnings)
            errors.extend(
                validate_measure_duplicates(accepted, dialect=request.dialect)
            )
        except Exception as error:
            errors.append(f"Validation failed: {error}")

    return ValidateSourcesResponse(
        valid=not errors,
        errors=errors,
        warnings=warnings,
        per_source_warnings=per_source_warnings,
    )

View file

@ -0,0 +1,254 @@
"""Generate klo-sl YAML source definitions from database schema scan data."""
from __future__ import annotations
import logging
import re
from typing import Any
from pydantic import BaseModel
from semantic_layer.models import (
ColumnRole,
JoinDeclaration,
MeasureDefinition,
SourceColumn,
SourceDefinition,
)
logger = logging.getLogger(__name__)
_NUMBER_PATTERN = re.compile(
r"int|integer|bigint|smallint|tinyint|numeric|decimal|float|double|real|number|money",
re.IGNORECASE,
)
_TIME_PATTERN = re.compile(
r"timestamp|datetime|date|time(?!stamp)",
re.IGNORECASE,
)
_BOOLEAN_PATTERN = re.compile(r"bool|boolean|bit", re.IGNORECASE)
_ID_PATTERN = re.compile(
r"^id$|_id$|^uuid$|_uuid$|_key$|_pk$|identifier$",
re.IGNORECASE,
)
_RELATIONSHIP_MAP = {
"MANY_TO_ONE": "many_to_one",
"ONE_TO_MANY": "one_to_many",
"ONE_TO_ONE": "one_to_one",
"many_to_one": "many_to_one",
"one_to_many": "one_to_many",
"one_to_one": "one_to_one",
}
_RELATIONSHIP_INVERSE = {
"many_to_one": "one_to_many",
"one_to_many": "many_to_one",
"one_to_one": "one_to_one",
}
# One scanned database column, as supplied to source generation.
class ColumnInput(BaseModel):
    name: str
    # Raw database type string; _map_column_type converts it to a
    # semantic-layer type.
    type: str
    # Primary-key columns form the generated source's grain.
    primary_key: bool = False
    nullable: bool = True
    # Copied to the generated column's description when present.
    comment: str | None = None
# One scanned table, as supplied to source generation.
class TableInput(BaseModel):
    # Also used as the generated source name and as the key that links refer to.
    name: str
    # catalog and db, when set, are prefixed to the name by _build_table_ref.
    catalog: str | None = None
    db: str | None = None
    # Copied to the generated source's description when present.
    comment: str | None = None
    columns: list[ColumnInput]
# A relationship between two scanned tables (from -> to).
class LinkInput(BaseModel):
    # Table names are compared with TableInput.name only; no catalog or
    # schema qualifier is part of the match.
    from_table: str
    from_column: str
    to_table: str
    to_column: str
    # Looked up in _RELATIONSHIP_MAP by generate_sources; values not present
    # there are treated as "many_to_one".
    relationship_type: str
# Request payload for generate_sources / generate_sources_response.
class GenerateSourcesRequest(BaseModel):
    tables: list[TableInput]
    links: list[LinkInput]
    # NOTE(review): generate_sources in this module does not read `dialect`;
    # confirm whether it is consumed elsewhere or reserved for later use.
    dialect: str = "postgres"
# Response payload produced by generate_sources_response.
class GenerateSourcesResponse(BaseModel):
    # Dumped source definitions, one per input table.
    sources: list[dict[str, Any]]
    # Set to len(sources) by generate_sources_response.
    source_count: int
def _map_column_type(db_type: str) -> str:
    """Map a raw database type string to a semantic-layer column type.

    The patterns are tried in a fixed order (boolean, then time, then number)
    and the first one found anywhere in *db_type* wins; anything else is
    ``"string"``.

    NOTE(review): the patterns are unanchored substring searches, so the
    ``int`` alternative of ``_NUMBER_PATTERN`` also matches type names such as
    ``interval`` or ``point`` -- confirm whether that is intended.
    """
    ordered_rules = (
        (_BOOLEAN_PATTERN, "boolean"),
        (_TIME_PATTERN, "time"),
        (_NUMBER_PATTERN, "number"),
    )
    for pattern, mapped_type in ordered_rules:
        if pattern.search(db_type):
            return mapped_type
    return "string"
def _build_table_ref(table: TableInput) -> str:
parts = []
if table.catalog:
parts.append(table.catalog)
if table.db:
parts.append(table.db)
parts.append(table.name)
return ".".join(parts)
def _generate_measures(
    table_name: str,
    columns: list[ColumnInput],
    pk_columns: list[str],
) -> list[MeasureDefinition]:
    """Derive default measures for a table.

    Produces a ``record_count`` measure counting the first primary-key column
    (only when the table has a primary key), followed by a ``total_<col>`` /
    ``avg_<col>`` pair for every numeric column whose name does not look like
    an identifier according to ``_ID_PATTERN``.
    """
    generated: list[MeasureDefinition] = []

    if pk_columns:
        first_pk = pk_columns[0]
        generated.append(
            MeasureDefinition(
                name="record_count",
                expr=f"count({first_pk})",
                description=f"Count of {table_name} records",
            )
        )

    for col in columns:
        is_numeric = _map_column_type(col.type) == "number"
        if not is_numeric or _ID_PATTERN.search(col.name):
            continue
        # The column comment, when present, is appended after an em dash.
        comment_suffix = f" \u2014 {col.comment}" if col.comment else ""
        generated.extend(
            [
                MeasureDefinition(
                    name=f"total_{col.name}",
                    expr=f"sum({col.name})",
                    description=f"Sum of {col.name}" + comment_suffix,
                ),
                MeasureDefinition(
                    name=f"avg_{col.name}",
                    expr=f"avg({col.name})",
                    description=f"Average of {col.name}" + comment_suffix,
                ),
            ]
        )

    return generated
def _assign_join_aliases(joins: list[JoinDeclaration]) -> None:
    """Alias every join whose target table is joined more than once.

    The preferred alias is ``<target>_<local column>``, where the local column
    is the left-hand side of the join's ``on`` expression. Reverse joins that
    originate from several foreign keys on the same table all share their
    left-hand column (the referenced key), so that alias alone would repeat.
    Joins whose preferred alias is not unique use the right-hand (remote)
    column instead, with a numeric suffix as a last resort. Preferred aliases
    that are already unique are kept as they are, and joins to a target that
    appears only once are left without an alias.
    """
    target_counts: dict[str, int] = {}
    for join in joins:
        target_counts[join.to] = target_counts.get(join.to, 0) + 1

    candidates: list[tuple[JoinDeclaration, str, str]] = []
    preferred_counts: dict[str, int] = {}
    for join in joins:
        if target_counts[join.to] < 2:
            continue
        local_side, _, remote_side = join.on.partition(" = ")
        preferred = f"{join.to}_{local_side.strip().lower()}"
        remote_column = remote_side.strip().rpartition(".")[2].lower()
        fallback = f"{join.to}_{remote_column}"
        candidates.append((join, preferred, fallback))
        preferred_counts[preferred] = preferred_counts.get(preferred, 0) + 1

    # Reserve the unique preferred aliases first so a fallback can never take
    # a name that another join is entitled to keep.
    taken = {alias for alias, count in preferred_counts.items() if count == 1}
    for join, preferred, fallback in candidates:
        if preferred_counts[preferred] == 1:
            join.alias = preferred
            continue
        alias = fallback
        suffix = 2
        while alias in taken:
            alias = f"{fallback}_{suffix}"
            suffix += 1
        taken.add(alias)
        join.alias = alias


def generate_sources(request: GenerateSourcesRequest) -> list[dict[str, Any]]:
    """Generate one semantic-layer source definition per scanned table.

    For every table this derives the grain, typed columns, forward and reverse
    joins from the request's links, and default measures.

    Args:
        request: Scanned tables and the links between them.

    Returns:
        The generated source definitions as dicts, with ``None`` fields
        omitted, in the order of ``request.tables``.
    """
    links_by_from: dict[str, list[LinkInput]] = {}
    links_by_to: dict[str, list[LinkInput]] = {}
    for link in request.links:
        links_by_from.setdefault(link.from_table, []).append(link)
        links_by_to.setdefault(link.to_table, []).append(link)

    table_names = {table.name for table in request.tables}
    sources: list[dict[str, Any]] = []

    for table in request.tables:
        pk_columns = [column.name for column in table.columns if column.primary_key]
        # Grain: the primary key if there is one, else the first column, else
        # a literal "id" placeholder for tables scanned without columns.
        if pk_columns:
            grain = pk_columns
        elif table.columns:
            grain = [table.columns[0].name]
        else:
            grain = ["id"]

        sl_columns: list[SourceColumn] = []
        for column in table.columns:
            sl_type = _map_column_type(column.type)
            role = ColumnRole.TIME if sl_type == "time" else ColumnRole.DEFAULT
            sl_columns.append(
                SourceColumn(
                    name=column.name,
                    type=sl_type,
                    role=role,
                    description=column.comment,
                )
            )

        joins: list[JoinDeclaration] = []

        # Forward joins: links that start at this table.
        for link in links_by_from.get(table.name, []):
            if link.to_table not in table_names:
                logger.warning(
                    "Skipping link from %s.%s to %s.%s: target table not in scan",
                    link.from_table,
                    link.from_column,
                    link.to_table,
                    link.to_column,
                )
                continue
            relationship = _RELATIONSHIP_MAP.get(link.relationship_type, "many_to_one")
            joins.append(
                JoinDeclaration(
                    to=link.to_table,
                    on=f"{link.from_column} = {link.to_table}.{link.to_column}",
                    relationship=relationship,
                )
            )

        # Reverse joins: links that end at this table, with the relationship
        # direction inverted.
        for link in links_by_to.get(table.name, []):
            if link.from_table not in table_names:
                logger.warning(
                    "Skipping reverse link from %s.%s to %s.%s: source table not in scan",
                    link.from_table,
                    link.from_column,
                    link.to_table,
                    link.to_column,
                )
                continue
            forward_relationship = _RELATIONSHIP_MAP.get(
                link.relationship_type, "many_to_one"
            )
            reverse_relationship = _RELATIONSHIP_INVERSE.get(
                forward_relationship, "one_to_many"
            )
            joins.append(
                JoinDeclaration(
                    to=link.from_table,
                    on=f"{link.to_column} = {link.from_table}.{link.from_column}",
                    relationship=reverse_relationship,
                )
            )

        _assign_join_aliases(joins)

        source = SourceDefinition(
            name=table.name,
            description=table.comment,
            table=_build_table_ref(table),
            grain=grain,
            columns=sl_columns,
            joins=joins,
            measures=_generate_measures(table.name, table.columns, pk_columns),
        )
        sources.append(source.model_dump(exclude_none=True))

    logger.info("Generated %d klo-sl source definitions", len(sources))
    return sources
def generate_sources_response(
    request: GenerateSourcesRequest,
) -> GenerateSourcesResponse:
    """Run ``generate_sources`` and wrap the result with its count."""
    generated = generate_sources(request)
    total = len(generated)
    return GenerateSourcesResponse(sources=generated, source_count=total)

View file

@ -0,0 +1,66 @@
from __future__ import annotations
from dataclasses import asdict
from typing import Literal
from pydantic import BaseModel, ConfigDict, Field
from semantic_layer.table_identifier_parser import (
ParseTableIdentifierItem as SharedParseTableIdentifierItem,
parse_table_identifier_batch,
)
ParseTableIdentifierReason = Literal[
"looker_template_unresolved",
"derived_table_not_supported",
"no_physical_table",
"multiple_table_references",
"unsupported_dialect",
"parse_error",
]
# One identifier to parse; copied field-for-field into the shared parser's
# item type by parse_table_identifier_response.
class ParseTableIdentifierItem(BaseModel):
    # Caller-chosen key; the response's results are keyed by it.
    key: str
    # The table expression to parse (e.g. a LookML sql_table_name value).
    sql_table_name: str
    dialect: str
# Batch request: every item is handed to the shared parser in one call.
class ParseTableIdentifierBatchRequest(BaseModel):
    items: list[ParseTableIdentifierItem]
# Parse outcome for one item, validated from the shared parser's dataclass
# result (via dataclasses.asdict).
class ParsedIdentifier(BaseModel):
    # populate_by_name lets the model be built with either the field name
    # `schema_` or its alias `schema`.
    model_config = ConfigDict(populate_by_name=True)
    ok: bool
    catalog: str | None = None
    # Exposed under the alias "schema"; the trailing underscore presumably
    # avoids clashing with BaseModel's own `schema` attribute -- confirm.
    schema_: str | None = Field(default=None, alias="schema")
    name: str | None = None
    canonical_table: str | None = None
    # One of the ParseTableIdentifierReason literals, or None.
    reason: ParseTableIdentifierReason | None = None
    detail: str | None = None
# Batch response: parse outcomes keyed by the keys the shared parser returns.
class ParseTableIdentifierBatchResponse(BaseModel):
    results: dict[str, ParsedIdentifier]
def parse_table_identifier_response(
    request: ParseTableIdentifierBatchRequest,
) -> ParseTableIdentifierBatchResponse:
    """Parse every requested identifier with the shared batch parser.

    Request items are converted to the shared parser's item type, parsed in a
    single batch call, and each dataclass result is validated into a
    ``ParsedIdentifier`` under the key the parser returned it with.
    """
    shared_items = []
    for entry in request.items:
        shared_items.append(
            SharedParseTableIdentifierItem(
                key=entry.key,
                sql_table_name=entry.sql_table_name,
                dialect=entry.dialect,
            )
        )

    parsed_by_key = parse_table_identifier_batch(shared_items)

    results = {}
    for key, outcome in parsed_by_key.items():
        results[key] = ParsedIdentifier.model_validate(asdict(outcome))
    return ParseTableIdentifierBatchResponse(results=results)

View file

@ -0,0 +1,442 @@
from __future__ import annotations
from fastapi.testclient import TestClient
from klo_daemon.app import create_app
from klo_daemon.database_introspection import (
DatabaseIntrospectionResponse,
LiveDatabaseColumn,
LiveDatabaseTable,
)
ORDERS_SOURCE = {
"name": "orders",
"table": "public.orders",
"grain": ["id"],
"columns": [
{"name": "id", "type": "number"},
{"name": "status", "type": "string"},
{"name": "amount", "type": "number"},
],
"joins": [],
"measures": [{"name": "order_count", "expr": "count(*)"}],
}
LOOKML_ORDER_VIEW = """
view: orders {
sql_table_name: public.orders ;;
dimension: id {
primary_key: yes
type: number
sql: ${TABLE}.id ;;
}
dimension: status {
type: string
sql: ${TABLE}.status ;;
}
measure: order_count {
type: count
}
}
"""
class FakeEmbeddingProvider:
    """Deterministic stand-in for an embedding provider.

    Records every batch passed to ``encode`` and returns, for each text, the
    vector ``[len(text), position, 1.0]``.
    """

    name = "fake"
    dimensions = 3
    max_batch_size = 2

    def __init__(self) -> None:
        self.calls: list[list[str]] = []

    def encode(self, texts: list[str]) -> list[list[float]]:
        # Store a copy so later changes to the caller's list do not alter the
        # recorded call.
        self.calls.append(list(texts))
        vectors: list[list[float]] = []
        for position, text in enumerate(texts):
            vectors.append([float(len(text)), float(position), 1.0])
        return vectors
def test_health_endpoint_returns_healthy() -> None:
    """GET /health answers 200 with the static healthy payload."""
    health = TestClient(create_app()).get("/health")

    assert health.status_code == 200
    assert health.json() == {"status": "healthy"}
def test_database_introspect_endpoint_returns_snapshot() -> None:
    """POST /database/introspect returns what the injected introspector built."""
    seen_requests = []

    def fake_introspector(request):
        seen_requests.append(request)
        id_column = LiveDatabaseColumn(
            name="id",
            type="integer",
            nullable=False,
            primary_key=True,
        )
        orders_table = LiveDatabaseTable(
            catalog="warehouse",
            db="public",
            name="orders",
            columns=[id_column],
        )
        return DatabaseIntrospectionResponse(
            connection_id=request.connection_id,
            extracted_at="2026-04-28T10:00:00+00:00",
            metadata={"driver": request.driver, "schemas": request.schemas},
            tables=[orders_table],
        )

    payload = {
        "connection_id": "warehouse",
        "driver": "postgres",
        "url": "postgresql://readonly@example.test/warehouse",
        "schemas": ["public"],
    }
    client = TestClient(create_app(database_introspector=fake_introspector))
    response = client.post("/database/introspect", json=payload)

    assert response.status_code == 200
    body = response.json()
    assert body["connection_id"] == "warehouse"
    assert body["tables"][0]["name"] == "orders"
    assert seen_requests[0].connection_id == "warehouse"
def test_database_introspect_endpoint_maps_value_error_to_400() -> None:
    """A ValueError from the introspector becomes a 400 with its message."""

    def fake_introspector(request):
        raise ValueError('database introspection supports only driver "postgres"')

    payload = {
        "connection_id": "warehouse",
        "driver": "snowflake",
        "url": "snowflake://example",
    }
    client = TestClient(create_app(database_introspector=fake_introspector))
    response = client.post("/database/introspect", json=payload)

    assert response.status_code == 400
    expected_body = {
        "detail": 'database introspection supports only driver "postgres"'
    }
    assert response.json() == expected_body
def test_embedding_compute_endpoint_returns_embedding() -> None:
    """POST /embeddings/compute encodes the text with the injected provider."""
    fake = FakeEmbeddingProvider()
    app = create_app(embedding_provider=fake)

    response = TestClient(app).post("/embeddings/compute", json={"text": "hello"})

    assert response.status_code == 200
    assert response.json() == {"embedding": [5.0, 0.0, 1.0]}
    assert fake.calls == [["hello"]]
def test_embedding_compute_bulk_endpoint_returns_embeddings() -> None:
    """POST /embeddings/compute-bulk encodes all texts in one provider call."""
    fake = FakeEmbeddingProvider()
    app = create_app(embedding_provider=fake)
    payload = {"texts": ["one", "three"]}

    response = TestClient(app).post("/embeddings/compute-bulk", json=payload)

    assert response.status_code == 200
    assert response.json() == {"embeddings": [[3.0, 0.0, 1.0], [5.0, 1.0, 1.0]]}
    assert fake.calls == [["one", "three"]]
def test_embedding_compute_bulk_endpoint_maps_value_error_to_400() -> None:
    """Exceeding the provider's batch limit yields a 400 before any encoding."""
    fake = FakeEmbeddingProvider()
    app = create_app(embedding_provider=fake)
    # Three texts against the fake provider's max_batch_size of 2.
    payload = {"texts": ["one", "two", "three"]}

    response = TestClient(app).post("/embeddings/compute-bulk", json=payload)

    assert response.status_code == 400
    assert response.json() == {"detail": "Maximum 2 texts allowed per batch"}
    assert fake.calls == []
def test_code_execute_endpoint_is_not_registered_by_default() -> None:
    """Without enable_code_execution, /code/execute responds with 404."""
    default_app = create_app()
    payload = {"code": "result = 7"}

    response = TestClient(default_app).post("/code/execute", json=payload)

    assert response.status_code == 404
def test_code_execute_endpoint_returns_result_when_enabled() -> None:
    """With code execution enabled, the result and console output come back."""
    app = create_app(enable_code_execution=True)
    payload = {"code": 'print("ran")\nresult = {"value": 7}'}

    response = TestClient(app).post("/code/execute", json=payload)

    assert response.status_code == 200
    body = response.json()
    assert body["result"] == {"value": 7}
    assert body["console_output"] == "ran\n"
    assert body["error"] is None
    assert body["message"] is None
    assert body["visualizations"] is None
    formatted = body["formatted_result"]
    assert "=== Console Output ===" in formatted
    assert "=== Result ===" in formatted
def test_code_execute_endpoint_serializes_numpy_result_when_enabled() -> None:
    """A numpy scalar in the result is returned as a plain JSON number."""
    app = create_app(enable_code_execution=True)
    payload = {"code": "import numpy as np\nresult = {'value': np.float64(1.25)}"}

    response = TestClient(app).post("/code/execute", json=payload)

    assert response.status_code == 200
    body = response.json()
    assert body["result"] == {"value": 1.25}
    assert body["error"] is None
def test_code_execute_endpoint_uses_host_free_boundary_when_enabled() -> None:
    """Scratchpad helpers report an error in the response body, still as a 200.

    The request carries an Authorization header and ids, yet the executed code
    still gets the "required for scratchpad operations" error.
    """
    app = create_app(enable_code_execution=True)
    scratchpad_code = (
        "import pandas as pd\n"
        "result = save_df_to_scratchpad(pd.DataFrame({'value': [1]}), 'out.json')"
    )
    payload = {
        "source_id": "chat_123",
        "message_id": "message_456",
        "code": scratchpad_code,
    }

    response = TestClient(app).post(
        "/code/execute",
        json=payload,
        headers={"Authorization": "Bearer should-not-forward"},
    )

    assert response.status_code == 200
    body = response.json()
    assert body["result"] is None
    expected_error = "nest_api_url, Authorization header, and source_id are required for scratchpad operations"
    assert body["error"] == expected_error
    assert "=== Error ===" in body["formatted_result"]
def test_sql_parse_table_identifier_endpoint() -> None:
    """A plain identifier parses; an unresolved Looker template is rejected."""
    plain_item = {
        "key": "orders",
        "sql_table_name": "public.orders",
        "dialect": "postgres",
    }
    template_item = {
        "key": "template",
        "sql_table_name": "${orders.SQL_TABLE_NAME}",
        "dialect": "postgres",
    }

    response = TestClient(create_app()).post(
        "/sql/parse-table-identifier",
        json={"items": [plain_item, template_item]},
    )

    assert response.status_code == 200
    results = response.json()["results"]
    parsed_orders = results["orders"]
    assert parsed_orders["ok"] is True
    assert parsed_orders["schema"] == "public"
    assert parsed_orders["name"] == "orders"
    parsed_template = results["template"]
    assert parsed_template["ok"] is False
    assert parsed_template["reason"] == "looker_template_unresolved"
def test_semantic_query_endpoint_returns_sql() -> None:
    """POST /semantic-layer/query returns SQL and qualified column names."""
    payload = {
        "sources": [ORDERS_SOURCE],
        "dialect": "postgres",
        "query": {
            "measures": ["orders.order_count"],
            "dimensions": ["orders.status"],
        },
    }

    response = TestClient(create_app()).post("/semantic-layer/query", json=payload)

    assert response.status_code == 200
    body = response.json()
    assert body["dialect"] == "postgres"
    assert "public.orders" in body["sql"]
    first_column = body["columns"][0]
    assert first_column["name"] == "orders.status"
def test_semantic_query_endpoint_maps_value_error_to_400() -> None:
    """A measure on an unknown source yields a 400 that names the measure."""
    payload = {
        "sources": [ORDERS_SOURCE],
        "dialect": "postgres",
        "query": {
            "measures": ["missing.order_count"],
            "dimensions": [],
        },
    }

    response = TestClient(create_app()).post("/semantic-layer/query", json=payload)

    assert response.status_code == 400
    detail = response.json()["detail"]
    assert "missing.order_count" in detail
def test_semantic_validate_endpoint_returns_structured_validation() -> None:
    """Duplicate measures are reported as errors in a 200 validation response."""
    revenue_measure = {"name": "revenue", "expr": "sum(amount)"}
    invalid_source = dict(ORDERS_SOURCE)
    # Same measure twice: the validator should flag the repeated name.
    invalid_source["measures"] = [dict(revenue_measure), dict(revenue_measure)]

    response = TestClient(create_app()).post(
        "/semantic-layer/validate",
        json={"sources": [invalid_source], "dialect": "postgres"},
    )

    assert response.status_code == 200
    body = response.json()
    assert body["valid"] is False
    assert any("Duplicate measure" in error for error in body["errors"])
    assert body["warnings"] == []
    assert body["per_source_warnings"] == {}
def test_semantic_generate_sources_endpoint_returns_sources() -> None:
    """Generated sources carry table ref, grain, joins and default measures."""
    orders_table = {
        "name": "orders",
        "db": "public",
        "comment": "Orders table",
        "columns": [
            {
                "name": "id",
                "type": "integer",
                "primary_key": True,
                "nullable": False,
                "comment": "Order ID",
            },
            {"name": "customer_id", "type": "integer"},
            {
                "name": "amount",
                "type": "decimal",
                "comment": "Order amount",
            },
        ],
    }
    customers_table = {
        "name": "customers",
        "db": "public",
        "columns": [
            {"name": "id", "type": "integer", "primary_key": True},
            {"name": "email", "type": "varchar"},
        ],
    }
    orders_to_customers = {
        "from_table": "orders",
        "from_column": "customer_id",
        "to_table": "customers",
        "to_column": "id",
        "relationship_type": "MANY_TO_ONE",
    }
    payload = {
        "tables": [orders_table, customers_table],
        "links": [orders_to_customers],
        "dialect": "postgres",
    }

    response = TestClient(create_app()).post(
        "/semantic-layer/generate-sources",
        json=payload,
    )

    assert response.status_code == 200
    body = response.json()
    assert body["source_count"] == 2
    by_name = {source["name"]: source for source in body["sources"]}
    orders = by_name["orders"]
    assert orders["table"] == "public.orders"
    assert orders["description"] == "Orders table"
    assert orders["grain"] == ["id"]
    assert orders["joins"] == [
        {
            "to": "customers",
            "on": "customer_id = customers.id",
            "relationship": "many_to_one",
        }
    ]
    measure_names = [measure["name"] for measure in orders["measures"]]
    assert measure_names == [
        "record_count",
        "total_amount",
        "avg_amount",
    ]
def test_lookml_parse_endpoint_returns_resolved_views() -> None:
    """POST /lookml/parse resolves a single view into a table-backed source."""
    payload = {
        "files": [
            {
                "path": "views/orders.view.lkml",
                "content": LOOKML_ORDER_VIEW,
            }
        ],
        "dialect": "postgres",
    }

    response = TestClient(create_app()).post("/lookml/parse", json=payload)

    assert response.status_code == 200
    body = response.json()
    assert body["joins"] == []
    assert body["skipped_views"] == []
    assert body["warnings"] == []
    views = body["views"]
    assert len(views) == 1
    view = views[0]
    assert view["name"] == "orders"
    assert view["source_type"] == "table"
    assert view["table_ref"] == "public.orders"
    assert view["grain"] == ["id"]
    column_names = [column["name"] for column in view["columns"]]
    assert column_names == ["id", "status"]
    expected_measure = {
        "name": "order_count",
        "expr": "count(*)",
        "filter": None,
        "description": None,
    }
    assert view["measures"] == [expected_measure]

View file

@ -0,0 +1,426 @@
from __future__ import annotations
import io
import json
import os
import subprocess
import sys
from pathlib import Path
from typing import Any
ORDERS_SOURCE = {
"name": "orders",
"table": "public.orders",
"grain": ["id"],
"columns": [
{"name": "id", "type": "number"},
{"name": "status", "type": "string"},
{"name": "amount", "type": "number"},
],
"joins": [],
"measures": [{"name": "order_count", "expr": "count(*)"}],
}
def run_daemon_command(
    command: str, payload: dict[str, object]
) -> subprocess.CompletedProcess[str]:
    """Run ``python -m klo_daemon <command>`` with *payload* as JSON on stdin.

    The package's ``src`` directory (a sibling of this tests directory) is put
    at the front of the child's ``PYTHONPATH`` so the package under test is
    importable without being installed.

    Args:
        command: The klo-daemon subcommand to run.
        payload: JSON-serializable request body written to the child's stdin.

    Returns:
        The completed process with captured text stdout and stderr. The exit
        code is not checked here; callers assert on it.
    """
    env = os.environ.copy()
    src_path = str(Path(__file__).resolve().parents[1] / "src")
    inherited = env.get("PYTHONPATH")
    # Append the inherited value only when there is one: a dangling separator
    # would add an empty entry to the child's PYTHONPATH.
    env["PYTHONPATH"] = (
        os.pathsep.join([src_path, inherited]) if inherited else src_path
    )
    return subprocess.run(
        [sys.executable, "-m", "klo_daemon", command],
        input=json.dumps(payload),
        text=True,
        capture_output=True,
        check=False,
        env=env,
    )
def test_semantic_query_command_reads_stdin_and_writes_json() -> None:
    """The semantic-query command prints a JSON response containing the SQL."""
    payload = {
        "sources": [ORDERS_SOURCE],
        "dialect": "postgres",
        "query": {
            "measures": ["orders.order_count"],
            "dimensions": ["orders.status"],
        },
    }

    completed = run_daemon_command("semantic-query", payload)

    assert completed.returncode == 0, completed.stderr
    output = json.loads(completed.stdout)
    assert "public.orders" in output["sql"]
    assert output["columns"][0]["name"] == "orders.status"
def test_semantic_validate_command_reads_stdin_and_writes_json() -> None:
    """The semantic-validate command reports a clean source as valid."""
    payload = {"sources": [ORDERS_SOURCE], "dialect": "postgres"}

    completed = run_daemon_command("semantic-validate", payload)

    assert completed.returncode == 0, completed.stderr
    expected = {
        "valid": True,
        "errors": [],
        "warnings": [],
        "per_source_warnings": {},
    }
    assert json.loads(completed.stdout) == expected
def test_command_returns_nonzero_for_invalid_json() -> None:
    """Malformed JSON on stdin makes the command exit 1 with the parse error.

    The subprocess is started directly instead of through
    ``run_daemon_command`` because that helper always serializes a valid JSON
    payload, while this test needs to send raw, malformed input.
    """
    env = os.environ.copy()
    src_path = str(Path(__file__).resolve().parents[1] / "src")
    inherited = env.get("PYTHONPATH")
    # Append the inherited value only when there is one: a dangling separator
    # would add an empty entry to the child's PYTHONPATH.
    env["PYTHONPATH"] = (
        os.pathsep.join([src_path, inherited]) if inherited else src_path
    )
    result = subprocess.run(
        [sys.executable, "-m", "klo_daemon", "semantic-query"],
        input="{",
        text=True,
        capture_output=True,
        check=False,
        env=env,
    )
    assert result.returncode == 1
    assert "Expecting property name enclosed in double quotes" in result.stderr
def test_serve_http_command_starts_uvicorn_without_reading_stdin(
    monkeypatch,
) -> None:
    """serve-http forwards its CLI options to the server and never reads stdin."""
    from klo_daemon import __main__ as daemon_main

    recorded: list[dict[str, object]] = []

    class FailingStdin:
        # Any attempt to read stdin fails the test immediately.
        def read(self) -> str:
            raise AssertionError("serve-http must not read stdin JSON")

    def fake_run_http_server(
        *,
        host: str,
        port: int,
        log_level: str,
        enable_code_execution: bool,
    ) -> None:
        options = {
            "host": host,
            "port": port,
            "log_level": log_level,
            "enable_code_execution": enable_code_execution,
        }
        recorded.append(options)

    monkeypatch.setattr(sys, "stdin", FailingStdin())
    monkeypatch.setattr(daemon_main, "run_http_server", fake_run_http_server)

    argv = [
        "serve-http",
        "--host",
        "127.0.0.1",
        "--port",
        "9191",
        "--log-level",
        "warning",
    ]
    exit_code = daemon_main.main(argv)

    assert exit_code == 0
    expected_call = {
        "host": "127.0.0.1",
        "port": 9191,
        "log_level": "warning",
        "enable_code_execution": False,
    }
    assert recorded == [expected_call]
def test_serve_http_command_defaults_to_loopback(monkeypatch) -> None:
    """With no options, serve-http uses 127.0.0.1:8765, info logging, no code exec."""
    from klo_daemon import __main__ as daemon_main

    recorded: list[dict[str, object]] = []

    def fake_run_http_server(
        *,
        host: str,
        port: int,
        log_level: str,
        enable_code_execution: bool,
    ) -> None:
        options = {
            "host": host,
            "port": port,
            "log_level": log_level,
            "enable_code_execution": enable_code_execution,
        }
        recorded.append(options)

    monkeypatch.setattr(daemon_main, "run_http_server", fake_run_http_server)

    exit_code = daemon_main.main(["serve-http"])

    assert exit_code == 0
    expected_call = {
        "host": "127.0.0.1",
        "port": 8765,
        "log_level": "info",
        "enable_code_execution": False,
    }
    assert recorded == [expected_call]
def test_serve_http_command_can_enable_code_execution(monkeypatch) -> None:
    """--enable-code-execution turns the flag on and leaves other defaults alone."""
    from klo_daemon import __main__ as daemon_main

    recorded: list[dict[str, object]] = []

    def fake_run_http_server(
        *,
        host: str,
        port: int,
        log_level: str,
        enable_code_execution: bool,
    ) -> None:
        options = {
            "host": host,
            "port": port,
            "log_level": log_level,
            "enable_code_execution": enable_code_execution,
        }
        recorded.append(options)

    monkeypatch.setattr(daemon_main, "run_http_server", fake_run_http_server)

    exit_code = daemon_main.main(["serve-http", "--enable-code-execution"])

    assert exit_code == 0
    expected_call = {
        "host": "127.0.0.1",
        "port": 8765,
        "log_level": "info",
        "enable_code_execution": True,
    }
    assert recorded == [expected_call]
def test_lookml_parse_command_reads_stdin_and_writes_json() -> None:
    """The lookml-parse command resolves an inline view and prints JSON.

    Checks that the view's table reference and its ``count`` measure
    (rendered as ``count(*)``) come through, with no joins, skipped views or
    warnings.
    """
    result = run_daemon_command(
        "lookml-parse",
        {
            "files": [
                {
                    "path": "views/orders.view.lkml",
                    "content": """
view: orders {
sql_table_name: public.orders ;;
dimension: id {
primary_key: yes
type: number
sql: ${TABLE}.id ;;
}
measure: order_count {
type: count
}
}
""",
                }
            ],
            "dialect": "postgres",
        },
    )
    # Surface the child's stderr in the failure message when the command fails.
    assert result.returncode == 0, result.stderr
    parsed = json.loads(result.stdout)
    assert parsed["views"][0]["name"] == "orders"
    assert parsed["views"][0]["table_ref"] == "public.orders"
    assert parsed["views"][0]["measures"][0]["expr"] == "count(*)"
    assert parsed["joins"] == []
    assert parsed["skipped_views"] == []
    assert parsed["warnings"] == []
def test_semantic_generate_sources_command_reads_stdin_and_writes_json() -> None:
    """The semantic-generate-sources command prints the generated sources."""
    orders_table = {
        "name": "orders",
        "db": "public",
        "columns": [
            {"name": "id", "type": "integer", "primary_key": True},
            {"name": "amount", "type": "decimal"},
        ],
    }
    payload = {
        "tables": [orders_table],
        "links": [],
        "dialect": "postgres",
    }

    completed = run_daemon_command("semantic-generate-sources", payload)

    assert completed.returncode == 0, completed.stderr
    output = json.loads(completed.stdout)
    assert output["source_count"] == 1
    generated = output["sources"][0]
    assert generated["name"] == "orders"
    assert generated["table"] == "public.orders"
    expected_measures = [
        {
            "name": "record_count",
            "expr": "count(id)",
            "segments": [],
            "description": "Count of orders records",
        },
        {
            "name": "total_amount",
            "expr": "sum(amount)",
            "segments": [],
            "description": "Sum of amount",
        },
        {
            "name": "avg_amount",
            "expr": "avg(amount)",
            "segments": [],
            "description": "Average of amount",
        },
    ]
    assert generated["measures"] == expected_measures
def test_database_introspect_command_reads_stdin_and_writes_json(
    monkeypatch, capsys
) -> None:
    """database-introspect parses stdin JSON and prints the introspector's result.

    The real introspector is replaced, so no database is contacted.
    """
    from klo_daemon import __main__ as daemon_main
    from klo_daemon.database_introspection import (
        DatabaseIntrospectionResponse,
        LiveDatabaseColumn,
        LiveDatabaseTable,
    )

    def fake_introspect(request):
        assert request.connection_id == "warehouse"
        assert request.driver == "postgres"
        assert request.schemas == ["public"]
        id_column = LiveDatabaseColumn(
            name="id",
            type="integer",
            nullable=False,
            primary_key=True,
        )
        orders_table = LiveDatabaseTable(
            catalog="warehouse",
            db="public",
            name="orders",
            columns=[id_column],
        )
        return DatabaseIntrospectionResponse(
            connection_id="warehouse",
            extracted_at="2026-04-28T10:00:00+00:00",
            metadata={"driver": "postgres", "schemas": ["public"]},
            tables=[orders_table],
        )

    stdin_payload = io.StringIO(
        '{"connection_id":"warehouse","driver":"postgres","url":"postgresql://readonly@example.test/warehouse","schemas":["public"]}'
    )
    monkeypatch.setattr(daemon_main, "introspect_database_response", fake_introspect)
    monkeypatch.setattr(sys, "stdin", stdin_payload)

    exit_code = daemon_main.main(["database-introspect"])

    assert exit_code == 0
    captured = capsys.readouterr()
    output = json.loads(captured.out)
    assert output["connection_id"] == "warehouse"
    assert output["metadata"] == {"driver": "postgres", "schemas": ["public"]}
    assert output["tables"][0]["name"] == "orders"
    assert captured.err == ""
def test_embedding_compute_command_reads_stdin_and_writes_json(
    monkeypatch, capsys
) -> None:
    """Invoke ``embedding-compute`` via ``main`` with a stubbed embedding function."""
    from klo_daemon import __main__ as daemon_main
    from klo_daemon.embeddings import ComputeEmbeddingResponse

    def fake_compute(request):
        # The stub checks the text that was supplied on stdin.
        assert request.text == "hello"
        return ComputeEmbeddingResponse(embedding=[1.0, 2.0, 3.0])

    stdin_stream = io.StringIO('{"text": "hello"}')
    monkeypatch.setattr(daemon_main, "compute_embedding_response", fake_compute)
    monkeypatch.setattr(sys, "stdin", stdin_stream)

    exit_code = daemon_main.main(["embedding-compute"])
    assert exit_code == 0

    streams = capsys.readouterr()
    assert json.loads(streams.out) == {"embedding": [1.0, 2.0, 3.0]}
    assert streams.err == ""
def test_embedding_compute_bulk_command_reads_stdin_and_writes_json(
    monkeypatch, capsys
) -> None:
    """Invoke ``embedding-compute-bulk`` via ``main`` with a stubbed bulk function."""
    from klo_daemon import __main__ as daemon_main
    from klo_daemon.embeddings import ComputeEmbeddingBulkResponse

    def fake_compute(request):
        # The stub checks the texts that were supplied on stdin.
        assert request.texts == ["hello", "world"]
        return ComputeEmbeddingBulkResponse(embeddings=[[1.0, 2.0], [3.0, 4.0]])

    stdin_stream = io.StringIO('{"texts": ["hello", "world"]}')
    monkeypatch.setattr(daemon_main, "compute_embedding_bulk_response", fake_compute)
    monkeypatch.setattr(sys, "stdin", stdin_stream)

    exit_code = daemon_main.main(["embedding-compute-bulk"])
    assert exit_code == 0

    streams = capsys.readouterr()
    assert json.loads(streams.out) == {"embeddings": [[1.0, 2.0], [3.0, 4.0]]}
    assert streams.err == ""
def test_code_execute_command_reads_stdin_and_writes_json(monkeypatch, capsys) -> None:
    """Invoke ``code-execute`` via ``main`` with the executor replaced by a recorder."""
    from klo_daemon import __main__ as daemon_main
    from klo_daemon.code_execution import ExecuteCodeResponse

    calls: list[dict[str, Any]] = []
    formatted = "\n\n=== Result ===\n\n7"

    def fake_execute(request, *, nest_api_url, auth_header):
        # Keep every argument so the test can inspect what main() passed in.
        recorded = {
            "request": request,
            "nest_api_url": nest_api_url,
            "auth_header": auth_header,
        }
        calls.append(recorded)
        return ExecuteCodeResponse(formatted_result=formatted, result=7)

    monkeypatch.setattr(daemon_main, "execute_code_response", fake_execute)
    monkeypatch.setattr(sys, "stdin", io.StringIO('{"code": "result = 7"}'))

    exit_code = daemon_main.main(["code-execute"])
    assert exit_code == 0

    streams = capsys.readouterr()
    expected_output = {
        "formatted_result": "\n\n=== Result ===\n\n7",
        "result": 7,
        "console_output": None,
        "error": None,
        "message": None,
        "visualizations": None,
    }
    assert json.loads(streams.out) == expected_output
    assert streams.err == ""

    first_call = calls[0]
    assert first_call["request"].code == "result = 7"
    assert first_call["nest_api_url"] is None
    assert first_call["auth_header"] is None

View file

@ -0,0 +1,210 @@
from __future__ import annotations
import json
from dataclasses import dataclass
from typing import Any
import numpy as np
import orjson
import pandas as pd
import pytest
from klo_daemon.code_execution import (
ExecuteCodeRequest,
create_scratchpad_helpers,
detect_visualizations,
dumps_numpy_json,
execute_code_response,
)
@dataclass
class FakeResponse:
json_payload: dict[str, Any] | None = None
content: bytes = b""
headers: dict[str, str] | None = None
def raise_for_status(self) -> None:
return None
def json(self) -> dict[str, Any]:
return self.json_payload or {}
class FakeHttpClient:
    """In-memory HTTP client double that logs requests and returns canned replies."""

    def __init__(self) -> None:
        self.posts: list[dict[str, Any]] = []
        self.gets: list[dict[str, Any]] = []

    def post(
        self,
        url: str,
        data: bytes,
        headers: dict[str, str],
        timeout: int,
    ) -> FakeResponse:
        """Log the POST (body decoded with orjson) and reply with a fixed filename."""
        recorded = {
            "url": url,
            "data": orjson.loads(data),
            "headers": headers,
            "timeout": timeout,
        }
        self.posts.append(recorded)
        return FakeResponse(json_payload={"filename": "saved.json"})

    def get(
        self,
        url: str,
        headers: dict[str, str],
        timeout: int,
    ) -> FakeResponse:
        """Log the GET and reply with a fixed one-row CSV body."""
        recorded = {"url": url, "headers": headers, "timeout": timeout}
        self.gets.append(recorded)
        csv_body = b"value,name\n1.25,alpha\n"
        csv_headers = {"content-type": "text/csv; charset=utf-8"}
        return FakeResponse(content=csv_body, headers=csv_headers)
def test_execute_code_response_captures_console_result_and_strips_ansi() -> None:
    """Raw console output keeps ANSI codes; the formatted result must not."""
    snippet = 'print("\\x1b[31mhello\\x1b[0m")\nresult = {"value": 3}'
    request = ExecuteCodeRequest(code=snippet)

    response = execute_code_response(
        request,
        nest_api_url=None,
        auth_header=None,
    )

    assert response.result == {"value": 3}
    assert response.console_output == "\x1b[31mhello\x1b[0m\n"
    formatted = response.formatted_result
    assert "=== Console Output ===" in formatted
    assert "hello" in formatted
    assert "\x1b" not in formatted
    assert "=== Result ===" in formatted
def test_execute_code_response_returns_message_when_result_is_absent() -> None:
    """Code that never assigns ``result`` yields a message instead of a value."""
    request = ExecuteCodeRequest(code='print("ran")')
    expected_message = "Code executed successfully but no result variable was set"

    response = execute_code_response(
        request,
        nest_api_url=None,
        auth_header=None,
    )

    assert response.result is None
    assert response.message == expected_message
    assert response.console_output == "ran\n"
    assert "=== Message ===" in response.formatted_result
def test_execute_code_response_detects_visualization_records() -> None:
    """A visualization-shaped ``result`` is reported in ``visualizations``."""
    chart_record = {
        "type": "visualization",
        "vis_type": "bar",
        "config": {"title": "Revenue"},
        "data": [{"month": "Jan", "revenue": 10}],
        "title": "Revenue",
    }
    # The executed code is simply an assignment of the JSON-rendered record.
    snippet = "result = " + json.dumps(chart_record)

    response = execute_code_response(
        ExecuteCodeRequest(code=snippet),
        nest_api_url=None,
        auth_header=None,
    )

    charts = response.visualizations
    assert charts is not None
    assert len(charts) == 1
    only_chart = charts[0]
    assert only_chart.vis_type == "bar"
    assert only_chart.title == "Revenue"
def test_detect_visualizations_filters_mixed_lists() -> None:
    """Only the visualization entry of a mixed list is returned."""
    note_entry = {"type": "note", "text": "skip"}
    table_entry = {
        "type": "visualization",
        "vis_type": "table",
        "config": {"title": "Rows"},
        "data": [{"row": 1}],
    }

    detected = detect_visualizations([note_entry, table_entry])

    # Compared against a fresh literal rather than the input object.
    expected = [
        {
            "type": "visualization",
            "vis_type": "table",
            "config": {"title": "Rows"},
            "data": [{"row": 1}],
        }
    ]
    assert detected == expected
def test_scratchpad_and_visualization_helpers_serialize_numpy_scalars() -> None:
    """Exercise the three helpers against a fake client and inspect what was sent."""
    client = FakeHttpClient()
    helpers = create_scratchpad_helpers(
        nest_api_url="http://nest",
        auth_header="Bearer token",
        source_id="source_123",
        message_id="message_456",
        http_client=client,
    )
    save_df, read_file, save_viz = helpers

    frame = pd.DataFrame({"value": [np.float64(1.25)]})
    save_message = save_df(frame, filename="df.json")
    assert save_message == "1 rows saved to saved.json"

    loaded = read_file("input.csv")
    assert loaded.to_dict(orient="records") == [{"value": 1.25, "name": "alpha"}]

    viz_ref = save_viz(
        vis_type="bar",
        config={"title": "Test", "x": "a", "y": np.float64(2.5)},
        data=[{"a": "row1", "b": np.float64(3.75)}],
    )
    assert viz_ref == "![viz](saved.json)"

    # First POST: the data frame upload.
    frame_post = client.posts[0]
    assert frame_post["url"] == "http://nest/private_api/scratchpad/source_123/files"
    assert frame_post["data"]["data"][0]["value"] == 1.25

    # The only GET: the raw file download.
    expected_get_url = (
        "http://nest/private_api/scratchpad/source_123/files/input.csv?format=raw"
    )
    assert client.gets[0]["url"] == expected_get_url

    # Second POST: the visualization upload.
    viz_post = client.posts[1]
    assert viz_post["url"] == "http://nest/private_api/visualizations/source_123"
    assert viz_post["data"]["config"]["y"] == 2.5
    assert viz_post["data"]["data"][0]["b"] == 3.75
def test_scratchpad_helpers_require_app_context_only_when_called() -> None:
    """Helpers can be created without context; each raises only once invoked."""
    store_frame, load_file, store_chart = create_scratchpad_helpers(
        nest_api_url=None,
        auth_header=None,
        source_id=None,
        message_id=None,
    )
    scratchpad_error = "required for scratchpad operations"
    visualization_error = "required for visualization operations"

    with pytest.raises(ValueError, match=scratchpad_error):
        store_frame(pd.DataFrame({"value": [1]}), filename="df.json")

    with pytest.raises(ValueError, match=scratchpad_error):
        load_file("df.csv")

    with pytest.raises(ValueError, match=visualization_error):
        store_chart("bar", {"title": "Chart"}, [{"value": 1}])
def test_dumps_numpy_json_serializes_numpy_values() -> None:
    """A NumPy scalar and array round-trip to a plain float and list."""
    numpy_values = {
        "scalar": np.float64(1.5),
        "array": np.array([1, 2, 3]),
    }

    encoded = dumps_numpy_json(numpy_values)

    decoded = orjson.loads(encoded)
    assert decoded == {"scalar": 1.5, "array": [1, 2, 3]}

View file

@ -0,0 +1,153 @@
from __future__ import annotations
import pytest
from klo_daemon.database_introspection import (
DatabaseIntrospectionRequest,
DatabaseIntrospectionRows,
_statement_timeout_config,
introspect_database_response,
)
def test_introspect_database_response_maps_postgres_catalog_rows() -> None:
    """Feed canned catalog rows to the introspector and check the mapped tables."""

    def catalog_row(table_name: str, **fields: object) -> dict[str, object]:
        # Every canned row starts with the same catalog/schema/table prefix.
        return {
            "table_catalog": "warehouse",
            "table_schema": "public",
            "table_name": table_name,
            **fields,
        }

    def fake_load_rows(
        request: DatabaseIntrospectionRequest,
    ) -> DatabaseIntrospectionRows:
        assert request.connection_id == "warehouse"
        assert request.driver == "postgres"
        assert request.schemas == ["public"]
        table_rows = [
            catalog_row("customers", table_comment=None),
            catalog_row("orders", table_comment="Orders table"),
        ]
        column_rows = [
            catalog_row(
                "orders",
                column_name="id",
                formatted_type="integer",
                is_nullable=False,
                is_primary_key=True,
                column_comment="Order ID",
            ),
            catalog_row(
                "orders",
                column_name="customer_id",
                formatted_type="integer",
                is_nullable=False,
                is_primary_key=False,
                column_comment=None,
            ),
            catalog_row(
                "customers",
                column_name="id",
                formatted_type="integer",
                is_nullable=False,
                is_primary_key=True,
                column_comment=None,
            ),
        ]
        foreign_key_rows = [
            catalog_row(
                "orders",
                from_column="customer_id",
                to_table="customers",
                to_column="id",
                constraint_name="orders_customer_id_fkey",
            )
        ]
        return DatabaseIntrospectionRows(
            table_rows=table_rows,
            column_rows=column_rows,
            foreign_key_rows=foreign_key_rows,
        )

    def fixed_now() -> str:
        return "2026-04-28T10:00:00+00:00"

    request = DatabaseIntrospectionRequest(
        connection_id="warehouse",
        driver="postgres",
        url="postgresql://readonly@example.test/warehouse",
        schemas=["public"],
    )

    response = introspect_database_response(
        request,
        load_rows=fake_load_rows,
        now=fixed_now,
    )

    assert response.connection_id == "warehouse"
    assert response.extracted_at == "2026-04-28T10:00:00+00:00"
    assert response.metadata == {"driver": "postgres", "schemas": ["public"]}
    table_names = [table.name for table in response.tables]
    assert table_names == ["customers", "orders"]

    orders_table = response.tables[1]
    expected_orders = {
        "catalog": "warehouse",
        "db": "public",
        "name": "orders",
        "comment": "Orders table",
        "columns": [
            {
                "name": "id",
                "type": "integer",
                "nullable": False,
                "primary_key": True,
                "comment": "Order ID",
            },
            {
                "name": "customer_id",
                "type": "integer",
                "nullable": False,
                "primary_key": False,
            },
        ],
        "foreign_keys": [
            {
                "from_column": "customer_id",
                "to_table": "customers",
                "to_column": "id",
                "constraint_name": "orders_customer_id_fkey",
            }
        ],
    }
    assert orders_table.model_dump(exclude_none=True) == expected_orders
def test_introspect_database_response_rejects_non_postgres_driver() -> None:
    """A ``snowflake`` driver must be refused with a ``ValueError``."""

    def empty_rows(request):
        return DatabaseIntrospectionRows([], [], [])

    expected_error = 'supports only driver "postgres"'
    # The request is built inside the context so an error raised at either
    # construction or introspection time is caught, as before.
    with pytest.raises(ValueError, match=expected_error):
        introspect_database_response(
            DatabaseIntrospectionRequest(
                connection_id="warehouse",
                driver="snowflake",
                url="snowflake://example",
            ),
            load_rows=empty_rows,
        )
def test_database_introspection_request_rejects_empty_schema_list() -> None:
    """Constructing a request with ``schemas=[]`` raises ``ValueError``."""
    request_fields = {
        "connection_id": "warehouse",
        "driver": "postgres",
        "url": "postgresql://readonly@example.test/warehouse",
        "schemas": [],
    }
    with pytest.raises(ValueError, match="at least one schema"):
        DatabaseIntrospectionRequest(**request_fields)
def test_statement_timeout_config_uses_parameterized_set_config() -> None:
    """The timeout is passed as a bound parameter, not spliced into the SQL text."""
    expected_statement = "SELECT set_config('statement_timeout', %s, true)"
    expected_params = ("30000ms",)

    configured = _statement_timeout_config(30_000)

    assert configured == (expected_statement, expected_params)

View file

@ -0,0 +1,107 @@
from __future__ import annotations
import pytest
from klo_daemon.embeddings import (
ComputeEmbeddingBulkRequest,
ComputeEmbeddingRequest,
SentenceTransformersEmbeddingProvider,
compute_embedding_bulk_response,
compute_embedding_response,
)
class FakeEmbeddingProvider:
    """Embedding provider stand-in that records every batch it is asked to encode."""

    name = "fake"
    dimensions = 3
    max_batch_size = 2

    def __init__(self) -> None:
        self.calls: list[list[str]] = []

    def encode(self, texts: list[str]) -> list[list[float]]:
        """Record *texts* and return ``[len(text), position, 1.0]`` per entry."""
        self.calls.append([*texts])
        vectors: list[list[float]] = []
        for position, entry in enumerate(texts):
            vectors.append([float(len(entry)), float(position), 1.0])
        return vectors
class ArrayLike:
def __init__(self, value: list[float] | list[list[float]]) -> None:
self.value = value
def tolist(self) -> list[float] | list[list[float]]:
return self.value
class FakeSentenceTransformerModel:
    """Model double whose ``encode`` accepts either one string or a list of strings."""

    def __init__(self) -> None:
        self.calls: list[str | list[str]] = []

    def encode(self, value: str | list[str]) -> ArrayLike:
        """Record *value* and return a fixed vector (str) or one row per text (list)."""
        self.calls.append(value)
        if not isinstance(value, str):
            rows = [
                [float(position), float(len(entry)), 0.5]
                for position, entry in enumerate(value)
            ]
            return ArrayLike(rows)
        return ArrayLike([0.1, 0.2, 0.3])
def test_compute_embedding_response_uses_injected_provider() -> None:
    """A single text is encoded through the injected provider as a one-item batch."""
    fake_provider = FakeEmbeddingProvider()
    request = ComputeEmbeddingRequest(text="hello")

    result = compute_embedding_response(request, provider=fake_provider)

    assert result.embedding == [5.0, 0.0, 1.0]
    assert fake_provider.calls == [["hello"]]
def test_compute_embedding_bulk_response_uses_injected_provider() -> None:
    """Two texts are encoded through the injected provider in a single call."""
    fake_provider = FakeEmbeddingProvider()
    request = ComputeEmbeddingBulkRequest(texts=["one", "three"])

    result = compute_embedding_bulk_response(request, provider=fake_provider)

    assert result.embeddings == [[3.0, 0.0, 1.0], [5.0, 1.0, 1.0]]
    assert fake_provider.calls == [["one", "three"]]
def test_compute_embedding_bulk_rejects_empty_texts() -> None:
    """A whitespace-only text is rejected before the provider is ever called."""
    fake_provider = FakeEmbeddingProvider()
    expected_error = "Empty texts found at indices: 1"

    # The request is built inside the context so an error raised at either
    # construction or compute time is caught, as before.
    with pytest.raises(ValueError, match=expected_error):
        compute_embedding_bulk_response(
            ComputeEmbeddingBulkRequest(texts=["valid", " "]),
            provider=fake_provider,
        )

    assert fake_provider.calls == []
def test_compute_embedding_bulk_respects_provider_batch_size() -> None:
    """Three texts exceed the fake provider's batch limit of two and are rejected."""
    fake_provider = FakeEmbeddingProvider()
    expected_error = "Maximum 2 texts allowed per batch"

    # The request is built inside the context so an error raised at either
    # construction or compute time is caught, as before.
    with pytest.raises(ValueError, match=expected_error):
        compute_embedding_bulk_response(
            ComputeEmbeddingBulkRequest(texts=["one", "two", "three"]),
            provider=fake_provider,
        )

    assert fake_provider.calls == []
def test_sentence_transformers_provider_normalizes_single_and_bulk_outputs() -> None:
    """Both one-item and multi-item batches come back as a list of vectors."""
    fake_model = FakeSentenceTransformerModel()
    provider = SentenceTransformersEmbeddingProvider(model=fake_model)

    single = provider.encode(["hello"])
    assert single == [[0.1, 0.2, 0.3]]

    bulk = provider.encode(["one", "three"])
    assert bulk == [
        [0.0, 3.0, 0.5],
        [1.0, 5.0, 0.5],
    ]

    # The one-item batch reached the model as a bare string, the other as a list.
    assert fake_model.calls == ["hello", ["one", "three"]]

View file

@ -0,0 +1,134 @@
from __future__ import annotations
from klo_daemon.lookml import (
LookMLFileInput,
ParseLookMLRequest,
parse_lookml_project,
)
# LookML fixture: an `orders` view over public.orders with three dimensions
# (id as primary key, user_id, status) and two measures (a count and a sum).
ORDER_VIEW = """
view: orders {
sql_table_name: public.orders ;;
dimension: id {
primary_key: yes
type: number
sql: ${TABLE}.id ;;
}
dimension: user_id {
type: number
sql: ${TABLE}.user_id ;;
}
dimension: status {
type: string
sql: ${TABLE}.status ;;
}
measure: order_count {
type: count
}
measure: revenue {
type: sum
sql: ${TABLE}.amount ;;
}
}
"""
# LookML fixture: a `users` view over public.users with only a primary-key id.
USER_VIEW = """
view: users {
sql_table_name: public.users ;;
dimension: id {
primary_key: yes
type: number
sql: ${TABLE}.id ;;
}
}
"""
# LookML fixture: an `orders` explore with a many_to_one join to `users`.
ORDER_MODEL = """
explore: orders {
join: users {
relationship: many_to_one
sql_on: ${orders.user_id} = ${users.id} ;;
}
}
"""
# LookML fixture: a view defined by a derived_table SQL query rather than a
# sql_table_name, declaring only the `status` dimension.
DERIVED_VIEW = """
view: order_rollup {
derived_table: {
sql:
SELECT status, SUM(amount) AS total_amount
FROM public.orders
GROUP BY status ;;
}
dimension: status {
type: string
sql: ${TABLE}.status ;;
}
}
"""
def test_parse_lookml_project_returns_views_and_joins() -> None:
    """Parse two views plus a model file and check the views and the join."""
    project_files = (
        ("views/orders.view.lkml", ORDER_VIEW),
        ("views/users.view.lkml", USER_VIEW),
        ("models/ecommerce.model.lkml", ORDER_MODEL),
    )
    request = ParseLookMLRequest(
        files=[
            LookMLFileInput(path=path, content=content)
            for path, content in project_files
        ],
        dialect="postgres",
    )

    response = parse_lookml_project(request)

    views_by_name = {}
    for parsed_view in response.views:
        views_by_name[parsed_view.name] = parsed_view
    assert sorted(views_by_name) == ["orders", "users"]

    orders = views_by_name["orders"]
    assert orders.source_type == "table"
    assert orders.table_ref == "public.orders"
    assert orders.grain == ["id"]
    measure_names = [measure.name for measure in orders.measures]
    assert measure_names == [
        "order_count",
        "revenue",
    ]
    assert orders.measures[0].expr == "count(*)"
    assert orders.measures[1].expr == "sum(amount)"

    first_join = response.joins[0]
    assert first_join.source_view == "orders"
    assert first_join.to == "users"
    assert first_join.relationship == "many_to_one"
    assert first_join.on == "orders.user_id = users.id"

    assert response.skipped_views == []
    assert response.warnings == []
def test_parse_lookml_project_extracts_derived_table_columns() -> None:
    """A derived-table view is reported as SQL-sourced with both selected columns."""
    rollup_file = LookMLFileInput(
        path="views/order_rollup.view.lkml",
        content=DERIVED_VIEW,
    )
    request = ParseLookMLRequest(files=[rollup_file], dialect="postgres")

    response = parse_lookml_project(request)

    assert len(response.views) == 1
    rollup = response.views[0]
    assert rollup.name == "order_rollup"
    assert rollup.source_type == "sql"
    # `sql` may be None, so fall back to an empty string for the substring check.
    rollup_sql = rollup.sql or ""
    assert "SELECT status, SUM(amount) AS total_amount" in rollup_sql
    column_names = [column.name for column in rollup.columns]
    assert column_names == ["status", "total_amount"]
    assert response.skipped_views == []
    assert response.warnings == []

View file

@ -0,0 +1,6 @@
from klo_daemon import PACKAGE_NAME, VERSION
def test_package_metadata() -> None:
    """The package exposes the expected name and version constants."""
    expected_name = "klo-daemon"
    expected_version = "0.1.0"
    assert PACKAGE_NAME == expected_name
    assert VERSION == expected_version

View file

@ -0,0 +1,64 @@
from __future__ import annotations
from klo_daemon.semantic_layer import (
SemanticLayerQueryRequest,
ValidateSourcesRequest,
query_semantic_layer,
validate_semantic_layer,
)
# Shared semantic-layer source fixture: the `orders` source over public.orders
# with grain `id`, three columns, no joins, and two measures.
ORDERS_SOURCE = {
"name": "orders",
"table": "public.orders",
"grain": ["id"],
"columns": [
{"name": "id", "type": "number"},
{"name": "status", "type": "string"},
{"name": "amount", "type": "number"},
],
"joins": [],
"measures": [
{"name": "order_count", "expr": "count(*)"},
{"name": "revenue", "expr": "sum(amount)"},
],
}
def test_query_semantic_layer_generates_sql_and_plan() -> None:
    """Query one measure by one dimension and inspect the SQL, columns, and plan."""
    query = {
        "measures": ["orders.order_count"],
        "dimensions": ["orders.status"],
        "limit": 25,
    }
    request = SemanticLayerQueryRequest(
        sources=[ORDERS_SOURCE],
        dialect="postgres",
        query=query,
    )

    response = query_semantic_layer(request)

    assert response.dialect == "postgres"
    generated_sql = response.sql
    assert "public.orders" in generated_sql
    assert "orders.status" in generated_sql
    # The dimension column is listed before the measure column.
    assert response.columns[0]["name"] == "orders.status"
    assert response.columns[1]["name"] == "orders.order_count"
    assert response.plan["sources_used"] == ["orders"]
def test_validate_semantic_layer_reports_duplicate_measure_names() -> None:
    """Two measures sharing a name make the source invalid with a matching error."""
    # Shallow copy of the shared fixture with only the measures replaced.
    invalid_source = dict(ORDERS_SOURCE)
    invalid_source["measures"] = [
        {"name": "revenue", "expr": "sum(amount)"},
        {"name": "revenue", "expr": "sum(amount)"},
    ]
    request = ValidateSourcesRequest(sources=[invalid_source], dialect="postgres")

    response = validate_semantic_layer(request)

    assert response.valid is False
    assert any("Duplicate measure" in message for message in response.errors)
    assert response.warnings == []

View file

@ -0,0 +1,161 @@
from __future__ import annotations
from klo_daemon.source_generation import (
ColumnInput,
GenerateSourcesRequest,
LinkInput,
TableInput,
generate_sources,
generate_sources_response,
)
def test_generate_sources_maps_tables_columns_measures_and_joins() -> None:
    """Check columns, measures, and both join directions for two linked tables."""
    orders_table = TableInput(
        name="orders",
        db="public",
        comment="Orders table",
        columns=[
            ColumnInput(
                name="id",
                type="integer",
                primary_key=True,
                nullable=False,
                comment="Order ID",
            ),
            ColumnInput(name="customer_id", type="integer"),
            ColumnInput(name="amount", type="decimal", comment="Order amount"),
            ColumnInput(name="created_at", type="timestamp"),
            ColumnInput(name="status", type="varchar"),
        ],
    )
    customers_table = TableInput(
        name="customers",
        db="public",
        columns=[
            ColumnInput(name="id", type="integer", primary_key=True),
            ColumnInput(name="email", type="varchar"),
        ],
    )
    customer_link = LinkInput(
        from_table="orders",
        from_column="customer_id",
        to_table="customers",
        to_column="id",
        relationship_type="MANY_TO_ONE",
    )
    request = GenerateSourcesRequest(
        tables=[orders_table, customers_table],
        links=[customer_link],
    )

    response = generate_sources_response(request)

    assert response.source_count == 2
    by_name = {}
    for generated in response.sources:
        by_name[generated["name"]] = generated

    orders = by_name["orders"]
    assert orders["description"] == "Orders table"
    assert orders["table"] == "public.orders"
    assert orders["grain"] == ["id"]
    assert orders["columns"] == [
        {
            "name": "id",
            "type": "number",
            "visibility": "public",
            "role": "default",
            "description": "Order ID",
        },
        {
            "name": "customer_id",
            "type": "number",
            "visibility": "public",
            "role": "default",
        },
        {
            "name": "amount",
            "type": "number",
            "visibility": "public",
            "role": "default",
            "description": "Order amount",
        },
        {"name": "created_at", "type": "time", "visibility": "public", "role": "time"},
        {"name": "status", "type": "string", "visibility": "public", "role": "default"},
    ]
    assert orders["joins"] == [
        {
            "to": "customers",
            "on": "customer_id = customers.id",
            "relationship": "many_to_one",
        }
    ]

    measures = orders["measures"]
    assert [entry["name"] for entry in measures] == [
        "record_count",
        "total_amount",
        "avg_amount",
    ]
    expected_exprs = ("count(id)", "sum(amount)", "avg(amount)")
    for position, expected_expr in enumerate(expected_exprs):
        assert measures[position]["expr"] == expected_expr

    # The link is also exposed from the customers side, with the inverse cardinality.
    assert by_name["customers"]["joins"] == [
        {
            "to": "orders",
            "on": "id = orders.customer_id",
            "relationship": "one_to_many",
        }
    ]
def test_generate_sources_aliases_multiple_joins_to_same_table() -> None:
    """Two links from orders to users each produce a join with its own alias."""
    orders_table = TableInput(
        name="orders",
        columns=[
            ColumnInput(name="id", type="integer", primary_key=True),
            ColumnInput(name="buyer_id", type="integer"),
            ColumnInput(name="seller_id", type="integer"),
        ],
    )
    users_table = TableInput(
        name="users",
        columns=[ColumnInput(name="id", type="integer", primary_key=True)],
    )
    # One link per foreign-key column, both pointing at users.id.
    user_links = [
        LinkInput(
            from_table="orders",
            from_column=foreign_key,
            to_table="users",
            to_column="id",
            relationship_type="many_to_one",
        )
        for foreign_key in ("buyer_id", "seller_id")
    ]
    request = GenerateSourcesRequest(
        tables=[orders_table, users_table],
        links=user_links,
    )

    sources = generate_sources(request)

    orders = next(entry for entry in sources if entry["name"] == "orders")
    expected_joins = [
        {
            "to": "users",
            "on": "buyer_id = users.id",
            "relationship": "many_to_one",
            "alias": "users_buyer_id",
        },
        {
            "to": "users",
            "on": "seller_id = users.id",
            "relationship": "many_to_one",
            "alias": "users_seller_id",
        },
    ]
    assert orders["joins"] == expected_joins