mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
Merge remote-tracking branch 'upstream/dev' into improvement-agent-speed
This commit is contained in:
commit
d5ee8cc4cd
287 changed files with 7551 additions and 6195 deletions
|
|
@ -167,10 +167,14 @@ COPY scripts/docker/entrypoint.sh /app/scripts/docker/entrypoint.sh
|
|||
RUN dos2unix /app/scripts/docker/entrypoint.sh && chmod +x /app/scripts/docker/entrypoint.sh
|
||||
|
||||
# SERVICE_ROLE controls which process this container runs:
|
||||
# api – FastAPI backend only (runs migrations on startup)
|
||||
# migrate – Run alembic upgrade head, verify zero_publication exists, exit 0.
|
||||
# Used by the dedicated `migrations` service in docker-compose.yml
|
||||
# so downstream services gate on `service_completed_successfully`.
|
||||
# api – FastAPI backend only (does NOT run migrations)
|
||||
# worker – Celery worker only
|
||||
# beat – Celery beat scheduler only
|
||||
# all – All three (legacy / dev default)
|
||||
# all – migrations + api + worker + beat (legacy / dev default;
|
||||
# fails fast on migration error)
|
||||
ENV SERVICE_ROLE=all
|
||||
|
||||
# Celery worker tuning (only used when SERVICE_ROLE=worker or all)
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument
|
||||
from app.utils.document_converters import embed_text
|
||||
from app.utils.surfsense_docs import surfsense_docs_public_url
|
||||
|
||||
|
||||
def format_surfsense_docs_results(results: list[tuple]) -> str:
|
||||
|
|
@ -19,13 +20,14 @@ def format_surfsense_docs_results(results: list[tuple]) -> str:
|
|||
# Group chunks by document
|
||||
grouped: dict[int, dict] = {}
|
||||
for chunk, doc in results:
|
||||
public_url = surfsense_docs_public_url(doc.source)
|
||||
if doc.id not in grouped:
|
||||
grouped[doc.id] = {
|
||||
"document_id": f"doc-{doc.id}",
|
||||
"document_type": "SURFSENSE_DOCS",
|
||||
"title": doc.title,
|
||||
"url": doc.source,
|
||||
"metadata": {"source": doc.source},
|
||||
"url": public_url,
|
||||
"metadata": {"source": doc.source, "public_url": public_url},
|
||||
"chunks": [],
|
||||
}
|
||||
grouped[doc.id]["chunks"].append(
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
|
||||
from app.utils.document_converters import embed_text
|
||||
from app.utils.surfsense_docs import surfsense_docs_public_url
|
||||
|
||||
|
||||
def format_surfsense_docs_results(results: list[tuple]) -> str:
|
||||
|
|
@ -40,13 +41,14 @@ def format_surfsense_docs_results(results: list[tuple]) -> str:
|
|||
# Group chunks by document
|
||||
grouped: dict[int, dict] = {}
|
||||
for chunk, doc in results:
|
||||
public_url = surfsense_docs_public_url(doc.source)
|
||||
if doc.id not in grouped:
|
||||
grouped[doc.id] = {
|
||||
"document_id": f"doc-{doc.id}",
|
||||
"document_type": "SURFSENSE_DOCS",
|
||||
"title": doc.title,
|
||||
"url": doc.source,
|
||||
"metadata": {"source": doc.source},
|
||||
"url": public_url,
|
||||
"metadata": {"source": doc.source, "public_url": public_url},
|
||||
"chunks": [],
|
||||
}
|
||||
grouped[doc.id]["chunks"].append(
|
||||
|
|
|
|||
|
|
@ -945,6 +945,36 @@ async def health_check():
|
|||
return {"status": "ok"}
|
||||
|
||||
|
||||
@app.get("/ready", tags=["health"])
@limiter.exempt
async def readiness_check():
    """Readiness probe.

    Verifies that the schema state required by downstream services is
    present. Specifically checks that the ``zero_publication`` Postgres
    logical-replication publication exists; without it zero-cache crash-loops
    on `Unknown or invalid publications`.

    Returns 200 when ready, 503 otherwise. A database that cannot be queried
    is also reported as 503 rather than surfacing as an unhandled 500, so the
    status code always matches the documented contract. Used by the
    docker-compose backend healthcheck and by ``install.ps1`` /
    ``install.sh`` post-up verification.

    Raises:
        HTTPException: 503 when the publication is missing or when the
            lookup itself fails.
    """
    from sqlalchemy import text
    from sqlalchemy.exc import SQLAlchemyError

    from app.db import async_session_maker

    try:
        async with async_session_maker() as session:
            result = await session.execute(
                text("SELECT 1 FROM pg_publication WHERE pubname = 'zero_publication'")
            )
            publication_row = result.first()
    except (SQLAlchemyError, OSError) as exc:
        # "Cannot reach / query the database" is a not-ready condition.
        # OSError is caught as well because a refused or reset connection
        # may be raised by the driver without being wrapped by SQLAlchemy.
        raise HTTPException(
            status_code=503,
            detail="database unavailable; cannot verify zero_publication",
        ) from exc

    if publication_row is None:
        raise HTTPException(
            status_code=503,
            detail="zero_publication missing; run alembic upgrade head",
        )
    return {"status": "ready"}
|
||||
|
||||
|
||||
@app.get("/verify-token")
|
||||
async def authenticated_route(
|
||||
user: User = Depends(current_active_user),
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ from app.schemas.surfsense_docs import (
|
|||
SurfsenseDocsDocumentWithChunksRead,
|
||||
)
|
||||
from app.users import current_active_user
|
||||
from app.utils.surfsense_docs import surfsense_docs_public_url
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
|
@ -76,6 +77,7 @@ async def get_surfsense_doc_by_chunk_id(
|
|||
id=document.id,
|
||||
title=document.title,
|
||||
source=document.source,
|
||||
public_url=surfsense_docs_public_url(document.source),
|
||||
content=document.content,
|
||||
chunks=[
|
||||
SurfsenseDocsChunkRead(id=c.id, content=c.content)
|
||||
|
|
@ -146,6 +148,7 @@ async def list_surfsense_docs(
|
|||
id=doc.id,
|
||||
title=doc.title,
|
||||
source=doc.source,
|
||||
public_url=surfsense_docs_public_url(doc.source),
|
||||
content=doc.content,
|
||||
created_at=doc.created_at,
|
||||
updated_at=doc.updated_at,
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ class SurfsenseDocsDocumentRead(BaseModel):
|
|||
id: int
|
||||
title: str
|
||||
source: str
|
||||
public_url: str
|
||||
content: str
|
||||
created_at: datetime | None = None
|
||||
updated_at: datetime | None = None
|
||||
|
|
@ -35,6 +36,7 @@ class SurfsenseDocsDocumentWithChunksRead(BaseModel):
|
|||
id: int
|
||||
title: str
|
||||
source: str
|
||||
public_url: str
|
||||
content: str
|
||||
chunks: list[SurfsenseDocsChunkRead]
|
||||
|
||||
|
|
|
|||
|
|
@ -79,6 +79,7 @@ from app.tasks.chat.streaming.helpers.interrupt_inspector import (
|
|||
)
|
||||
from app.utils.content_utils import bootstrap_history_from_db
|
||||
from app.utils.perf import get_perf_logger, log_system_snapshot, trim_native_heap
|
||||
from app.utils.surfsense_docs import surfsense_docs_public_url
|
||||
from app.utils.user_message_multimodal import build_human_message_content
|
||||
|
||||
_background_tasks: set[asyncio.Task] = set()
|
||||
|
|
@ -214,14 +215,17 @@ def format_mentioned_surfsense_docs_as_context(
|
|||
)
|
||||
|
||||
for doc in documents:
|
||||
metadata_json = json.dumps({"source": doc.source}, ensure_ascii=False)
|
||||
public_url = surfsense_docs_public_url(doc.source)
|
||||
metadata_json = json.dumps(
|
||||
{"source": doc.source, "public_url": public_url}, ensure_ascii=False
|
||||
)
|
||||
|
||||
context_parts.append("<document>")
|
||||
context_parts.append("<document_metadata>")
|
||||
context_parts.append(f" <document_id>doc-{doc.id}</document_id>")
|
||||
context_parts.append(" <document_type>SURFSENSE_DOCS</document_type>")
|
||||
context_parts.append(f" <title><![CDATA[{doc.title}]]></title>")
|
||||
context_parts.append(f" <url><![CDATA[{doc.source}]]></url>")
|
||||
context_parts.append(f" <url><![CDATA[{public_url}]]></url>")
|
||||
context_parts.append(
|
||||
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
|
||||
)
|
||||
|
|
|
|||
13
surfsense_backend/app/utils/surfsense_docs.py
Normal file
13
surfsense_backend/app/utils/surfsense_docs.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
"""Utilities for SurfSense's built-in documentation index."""
|
||||
|
||||
from pathlib import PurePosixPath
|
||||
|
||||
# Route prefix under which the built-in documentation pages are served.
DOCS_PUBLIC_ROOT = PurePosixPath("/docs")


def surfsense_docs_public_url(source: str) -> str:
    """Return the public docs route for an indexed documentation source path.

    The file suffix is dropped and a trailing ``index`` segment collapses to
    its parent directory, e.g. ``"how-to/index.mdx"`` -> ``"/docs/how-to"``.

    Args:
        source: Source path of the indexed document, using POSIX separators.
            Leading slashes are ignored; an empty value maps to the docs root.

    Returns:
        The route as a POSIX path string rooted at ``DOCS_PUBLIC_ROOT``.
    """
    # Joining an absolute path onto DOCS_PUBLIC_ROOT would discard the root,
    # so the source is always treated as relative to it.
    relative = PurePosixPath(source.lstrip("/"))
    if not relative.name:
        # with_suffix() raises ValueError for a path that has no name.
        return DOCS_PUBLIC_ROOT.as_posix()

    docs_path = relative.with_suffix("")
    if docs_path.name == "index":
        docs_path = docs_path.parent
    return (DOCS_PUBLIC_ROOT / docs_path).as_posix()
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
[project]
|
||||
name = "surf-new-backend"
|
||||
version = "0.0.23"
|
||||
version = "0.0.24"
|
||||
description = "SurfSense Backend"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
|
|
|
|||
|
|
@ -4,10 +4,15 @@ set -e
|
|||
# ─────────────────────────────────────────────────────────────
|
||||
# SERVICE_ROLE controls which process(es) this container runs.
|
||||
#
|
||||
# api – FastAPI backend only (runs migrations on startup)
|
||||
# migrate – Run `alembic upgrade head`, verify zero_publication,
|
||||
# then exit 0. Used by the dedicated `migrations` service
|
||||
# in docker-compose.yml so downstream services can gate
|
||||
# on `condition: service_completed_successfully`.
|
||||
# api – FastAPI backend only (does NOT run migrations)
|
||||
# worker – Celery worker only
|
||||
# beat – Celery beat scheduler only
|
||||
# all – All three in one container (legacy / dev default)
|
||||
# all – migrations + api + worker + beat in one container
|
||||
# (legacy / dev default; fails fast on migration error)
|
||||
#
|
||||
# Set SERVICE_ROLE as an environment variable in Coolify for
|
||||
# each service deployment.
|
||||
|
|
@ -41,7 +46,13 @@ cleanup() {
|
|||
|
||||
trap cleanup SIGTERM SIGINT
|
||||
|
||||
# ── Database migrations (only for api / all) ─────────────────
|
||||
# ── Database migrations (only for migrate / all) ─────────────
|
||||
# Fail-fast contract:
|
||||
# - alembic upgrade head must succeed within ${MIGRATION_TIMEOUT:-900}s
|
||||
# - zero_publication must exist in pg_publication afterwards
|
||||
# Either failure exits non-zero so the dedicated `migrations` compose
|
||||
# service exits non-zero, halting the rest of the stack instead of
|
||||
# silently producing a half-built system that crash-loops zero-cache.
|
||||
run_migrations() {
|
||||
echo "Running database migrations..."
|
||||
for i in {1..30}; do
|
||||
|
|
@ -53,11 +64,66 @@ run_migrations() {
|
|||
sleep 1
|
||||
done
|
||||
|
||||
if timeout 300 alembic upgrade head 2>&1; then
|
||||
echo "Migrations completed successfully."
|
||||
else
|
||||
echo "WARNING: Migration failed or timed out. Continuing anyway..."
|
||||
echo "You may need to run migrations manually: alembic upgrade head"
|
||||
local timeout_secs="${MIGRATION_TIMEOUT:-900}"
|
||||
echo "Running alembic upgrade head (timeout=${timeout_secs}s)..."
|
||||
if ! timeout "${timeout_secs}" alembic upgrade head; then
|
||||
echo "ERROR: alembic upgrade head failed (or exceeded ${timeout_secs}s timeout)." >&2
|
||||
echo "Refusing to start. Inspect the error above and re-run." >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "Migrations completed successfully."
|
||||
|
||||
echo "Verifying zero_publication exists in Postgres..."
|
||||
local pub_oid
|
||||
pub_oid=$(python <<'PY' 2>/dev/null || true
|
||||
import asyncio
|
||||
import sys
|
||||
from sqlalchemy import text
|
||||
from app.db import engine
|
||||
|
||||
|
||||
async def get_oid():
|
||||
async with engine.connect() as conn:
|
||||
result = await conn.execute(
|
||||
text("SELECT oid FROM pg_publication WHERE pubname = 'zero_publication'")
|
||||
)
|
||||
row = result.first()
|
||||
if row is None:
|
||||
sys.exit(1)
|
||||
print(int(row[0]))
|
||||
|
||||
|
||||
asyncio.run(get_oid())
|
||||
PY
|
||||
)
|
||||
if [ -z "${pub_oid}" ]; then
|
||||
echo "ERROR: zero_publication is missing from Postgres after running alembic." >&2
|
||||
echo "This usually means migration 116 (or a later publication migration) did not run." >&2
|
||||
echo "Inspect alembic state with:" >&2
|
||||
echo " docker compose exec db psql -U \"\$DB_USER\" -d \"\$DB_NAME\" -c 'SELECT * FROM alembic_version;'" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "zero_publication verified (oid=${pub_oid})."
|
||||
|
||||
# Stale-replica safety net: if /zero-init is mounted (i.e. we are the
|
||||
# dedicated `migrations` compose service), drop a marker file when the
|
||||
# publication oid changed (or on first run) so the wrapped zero-cache
|
||||
# entrypoint can wipe /data/zero.db before starting. This recovers from
|
||||
# the case where a previous zero-cache crashed mid-init and left a
|
||||
# half-built SQLite replica without a `_zero.tableMetadata` table.
|
||||
if [ -d /zero-init ]; then
|
||||
local stored_oid=""
|
||||
[ -f /zero-init/last_pub_oid ] && stored_oid=$(cat /zero-init/last_pub_oid 2>/dev/null || true)
|
||||
if [ -z "${stored_oid}" ] || [ "${stored_oid}" != "${pub_oid}" ]; then
|
||||
echo "Publication oid changed (stored=${stored_oid:-<none>}, current=${pub_oid}); writing /zero-init/needs_reset."
|
||||
: > /zero-init/needs_reset
|
||||
chmod 666 /zero-init/needs_reset 2>/dev/null || true
|
||||
fi
|
||||
echo "${pub_oid}" > /zero-init/last_pub_oid
|
||||
chmod 666 /zero-init/last_pub_oid 2>/dev/null || true
|
||||
# World-writable dir so the (possibly non-root) zero-cache container
|
||||
# can `rm -f /zero-init/needs_reset` after acting on the marker.
|
||||
chmod 777 /zero-init 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
|
||||
|
|
@ -102,8 +168,12 @@ start_beat() {
|
|||
|
||||
# ── Main: run based on role ──────────────────────────────────
|
||||
case "${SERVICE_ROLE}" in
|
||||
api)
|
||||
migrate)
|
||||
run_migrations
|
||||
echo "Migrations complete; exiting cleanly."
|
||||
exit 0
|
||||
;;
|
||||
api)
|
||||
start_api
|
||||
;;
|
||||
worker)
|
||||
|
|
@ -121,7 +191,7 @@ case "${SERVICE_ROLE}" in
|
|||
start_beat
|
||||
;;
|
||||
*)
|
||||
echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: api, worker, beat, or all"
|
||||
echo "ERROR: Unknown SERVICE_ROLE '${SERVICE_ROLE}'. Use: migrate, api, worker, beat, or all"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
|
|
|||
2
surfsense_backend/uv.lock
generated
2
surfsense_backend/uv.lock
generated
|
|
@ -7947,7 +7947,7 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "surf-new-backend"
|
||||
version = "0.0.23"
|
||||
version = "0.0.24"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "alembic" },
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue