mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 01:06:23 +02:00
feat: enhance knowledge base search with date filtering
This commit is contained in:
parent
006dccbe4b
commit
ad0e77c3d6
7 changed files with 660 additions and 12 deletions
|
|
@ -152,7 +152,9 @@ class _FakeReconciliationStripeClient:
|
|||
|
||||
|
||||
class TestStripeCheckoutSessionCreation:
|
||||
async def test_get_status_reflects_backend_toggle(self, client, headers, monkeypatch):
|
||||
async def test_get_status_reflects_backend_toggle(
|
||||
self, client, headers, monkeypatch
|
||||
):
|
||||
monkeypatch.setattr(stripe_routes.config, "STRIPE_PAGE_BUYING_ENABLED", False)
|
||||
disabled_response = await client.get("/api/v1/stripe/status", headers=headers)
|
||||
assert disabled_response.status_code == 200, disabled_response.text
|
||||
|
|
@ -237,7 +239,9 @@ class TestStripeCheckoutSessionCreation:
|
|||
)
|
||||
|
||||
assert response.status_code == 503, response.text
|
||||
assert response.json()["detail"] == "Page purchases are temporarily unavailable."
|
||||
assert (
|
||||
response.json()["detail"] == "Page purchases are temporarily unavailable."
|
||||
)
|
||||
|
||||
purchase_count = await _fetchrow("SELECT COUNT(*) AS count FROM page_purchases")
|
||||
assert purchase_count is not None
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import UTC, datetime
|
||||
from datetime import UTC, datetime, timedelta
|
||||
|
||||
import pytest_asyncio
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
|
@ -22,6 +22,7 @@ def _make_document(
|
|||
content: str,
|
||||
search_space_id: int,
|
||||
created_by_id: str,
|
||||
updated_at: datetime | None = None,
|
||||
) -> Document:
|
||||
uid = uuid.uuid4().hex[:12]
|
||||
return Document(
|
||||
|
|
@ -34,7 +35,7 @@ def _make_document(
|
|||
search_space_id=search_space_id,
|
||||
created_by_id=created_by_id,
|
||||
embedding=DUMMY_EMBEDDING,
|
||||
updated_at=datetime.now(UTC),
|
||||
updated_at=updated_at or datetime.now(UTC),
|
||||
status={"state": "ready"},
|
||||
)
|
||||
|
||||
|
|
@ -104,3 +105,54 @@ async def seed_large_doc(
|
|||
"search_space": db_search_space,
|
||||
"user": db_user,
|
||||
}
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def seed_date_filtered_docs(
|
||||
db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
|
||||
):
|
||||
"""Insert matching docs with different timestamps for date-filter tests."""
|
||||
user_id = str(db_user.id)
|
||||
space_id = db_search_space.id
|
||||
now = datetime.now(UTC)
|
||||
|
||||
recent_doc = _make_document(
|
||||
title="Recent OCV Notes",
|
||||
document_type=DocumentType.FILE,
|
||||
content="ocv meeting decisions and action items",
|
||||
search_space_id=space_id,
|
||||
created_by_id=user_id,
|
||||
updated_at=now,
|
||||
)
|
||||
old_doc = _make_document(
|
||||
title="Old OCV Notes",
|
||||
document_type=DocumentType.FILE,
|
||||
content="ocv meeting decisions and action items",
|
||||
search_space_id=space_id,
|
||||
created_by_id=user_id,
|
||||
updated_at=now - timedelta(days=730),
|
||||
)
|
||||
|
||||
db_session.add_all([recent_doc, old_doc])
|
||||
await db_session.flush()
|
||||
|
||||
db_session.add_all(
|
||||
[
|
||||
_make_chunk(
|
||||
content="ocv meeting decisions and action items recent",
|
||||
document_id=recent_doc.id,
|
||||
),
|
||||
_make_chunk(
|
||||
content="ocv meeting decisions and action items old",
|
||||
document_id=old_doc.id,
|
||||
),
|
||||
]
|
||||
)
|
||||
await db_session.flush()
|
||||
|
||||
return {
|
||||
"recent_doc": recent_doc,
|
||||
"old_doc": old_doc,
|
||||
"search_space": db_search_space,
|
||||
"user": db_user,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,62 @@
|
|||
"""Integration smoke tests for KB search query/date scoping."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import asynccontextmanager
|
||||
from datetime import UTC, datetime, timedelta
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from app.agents.new_chat.middleware.knowledge_search import search_knowledge_base
|
||||
|
||||
from .conftest import DUMMY_EMBEDDING
|
||||
|
||||
pytestmark = pytest.mark.integration
|
||||
|
||||
|
||||
async def test_search_knowledge_base_applies_date_filters(
|
||||
db_session,
|
||||
seed_date_filtered_docs,
|
||||
monkeypatch,
|
||||
):
|
||||
"""Date filters should remove older matching documents from scoped KB results."""
|
||||
|
||||
@asynccontextmanager
|
||||
async def fake_shielded_async_session():
|
||||
yield db_session
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.shielded_async_session",
|
||||
fake_shielded_async_session,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.embed_texts",
|
||||
lambda texts: [np.array(DUMMY_EMBEDDING) for _ in texts],
|
||||
)
|
||||
|
||||
space_id = seed_date_filtered_docs["search_space"].id
|
||||
recent_cutoff = datetime.now(UTC) - timedelta(days=30)
|
||||
|
||||
unfiltered_results = await search_knowledge_base(
|
||||
query="ocv meeting decisions",
|
||||
search_space_id=space_id,
|
||||
available_document_types=["FILE"],
|
||||
top_k=10,
|
||||
)
|
||||
filtered_results = await search_knowledge_base(
|
||||
query="ocv meeting decisions",
|
||||
search_space_id=space_id,
|
||||
available_document_types=["FILE"],
|
||||
top_k=10,
|
||||
start_date=recent_cutoff,
|
||||
end_date=datetime.now(UTC),
|
||||
)
|
||||
|
||||
unfiltered_ids = {result["document"]["id"] for result in unfiltered_results}
|
||||
filtered_ids = {result["document"]["id"] for result in filtered_results}
|
||||
|
||||
assert seed_date_filtered_docs["recent_doc"].id in unfiltered_ids
|
||||
assert seed_date_filtered_docs["old_doc"].id in unfiltered_ids
|
||||
assert seed_date_filtered_docs["recent_doc"].id in filtered_ids
|
||||
assert seed_date_filtered_docs["old_doc"].id not in filtered_ids
|
||||
|
|
@ -1,12 +1,16 @@
|
|||
"""Unit tests for knowledge_search middleware helpers.
|
||||
"""Unit tests for knowledge_search middleware helpers."""
|
||||
|
||||
These test pure functions that don't require a database.
|
||||
"""
|
||||
import json
|
||||
|
||||
import pytest
|
||||
from langchain_core.messages import AIMessage, HumanMessage
|
||||
|
||||
from app.agents.new_chat.middleware.knowledge_search import (
|
||||
KnowledgeBaseSearchMiddleware,
|
||||
_build_document_xml,
|
||||
_normalize_optional_date_range,
|
||||
_parse_kb_search_plan_response,
|
||||
_render_recent_conversation,
|
||||
_resolve_search_types,
|
||||
)
|
||||
|
||||
|
|
@ -131,3 +135,234 @@ class TestBuildDocumentXml:
|
|||
line for line in lines if "<![CDATA[" in line and "<chunk" in line
|
||||
]
|
||||
assert len(chunk_lines) == 3
|
||||
|
||||
|
||||
# ── planner parsing / date normalization ───────────────────────────────
|
||||
|
||||
|
||||
class TestPlannerHelpers:
|
||||
def test_parse_kb_search_plan_response_accepts_plain_json(self):
|
||||
plan = _parse_kb_search_plan_response(
|
||||
json.dumps(
|
||||
{
|
||||
"optimized_query": "ocv meeting decisions summary",
|
||||
"start_date": "2026-03-01",
|
||||
"end_date": "2026-03-31",
|
||||
}
|
||||
)
|
||||
)
|
||||
assert plan.optimized_query == "ocv meeting decisions summary"
|
||||
assert plan.start_date == "2026-03-01"
|
||||
assert plan.end_date == "2026-03-31"
|
||||
|
||||
def test_parse_kb_search_plan_response_accepts_fenced_json(self):
|
||||
plan = _parse_kb_search_plan_response(
|
||||
"""```json
|
||||
{"optimized_query":"deel founders guide","start_date":null,"end_date":null}
|
||||
```"""
|
||||
)
|
||||
assert plan.optimized_query == "deel founders guide"
|
||||
assert plan.start_date is None
|
||||
assert plan.end_date is None
|
||||
|
||||
def test_normalize_optional_date_range_returns_none_when_absent(self):
|
||||
start_date, end_date = _normalize_optional_date_range(None, None)
|
||||
assert start_date is None
|
||||
assert end_date is None
|
||||
|
||||
def test_normalize_optional_date_range_resolves_single_bound(self):
|
||||
start_date, end_date = _normalize_optional_date_range("2026-03-01", None)
|
||||
assert start_date is not None
|
||||
assert end_date is not None
|
||||
assert start_date.date().isoformat() == "2026-03-01"
|
||||
assert end_date >= start_date
|
||||
|
||||
|
||||
class FakeLLM:
|
||||
def __init__(self, response_text: str):
|
||||
self.response_text = response_text
|
||||
self.calls: list[dict] = []
|
||||
|
||||
async def ainvoke(self, messages, config=None):
|
||||
self.calls.append({"messages": messages, "config": config})
|
||||
return AIMessage(content=self.response_text)
|
||||
|
||||
|
||||
class FakeBudgetLLM:
|
||||
def __init__(self, *, max_input_tokens: int):
|
||||
self._max_input_tokens_value = max_input_tokens
|
||||
|
||||
def _get_max_input_tokens(self) -> int:
|
||||
return self._max_input_tokens_value
|
||||
|
||||
def _count_tokens(self, messages) -> int:
|
||||
# Deterministic, simple proxy for tests: count characters as tokens.
|
||||
return sum(len(msg.get("content", "")) for msg in messages)
|
||||
|
||||
|
||||
class TestKnowledgeBaseSearchMiddlewarePlanner:
|
||||
def test_render_recent_conversation_prefers_latest_messages_under_budget(self):
|
||||
messages = [
|
||||
HumanMessage(content="old user context " * 40),
|
||||
AIMessage(content="old assistant answer " * 35),
|
||||
HumanMessage(content="recent user context " * 20),
|
||||
AIMessage(content="recent assistant answer " * 18),
|
||||
HumanMessage(content="latest question"),
|
||||
]
|
||||
|
||||
rendered = _render_recent_conversation(
|
||||
messages,
|
||||
llm=FakeBudgetLLM(max_input_tokens=900),
|
||||
user_text="latest question",
|
||||
)
|
||||
|
||||
assert "recent user context" in rendered
|
||||
assert "recent assistant answer" in rendered
|
||||
assert "latest question" not in rendered
|
||||
assert rendered.index("recent user context") < rendered.index(
|
||||
"recent assistant answer"
|
||||
)
|
||||
|
||||
def test_render_recent_conversation_falls_back_to_legacy_without_budgeting(self):
|
||||
messages = [
|
||||
HumanMessage(content="message one"),
|
||||
AIMessage(content="message two"),
|
||||
HumanMessage(content="latest question"),
|
||||
]
|
||||
|
||||
rendered = _render_recent_conversation(
|
||||
messages,
|
||||
llm=None,
|
||||
user_text="latest question",
|
||||
)
|
||||
|
||||
assert "user: message one" in rendered
|
||||
assert "assistant: message two" in rendered
|
||||
assert "latest question" not in rendered
|
||||
|
||||
async def test_middleware_uses_optimized_query_and_dates(self, monkeypatch):
|
||||
captured: dict = {}
|
||||
|
||||
async def fake_search_knowledge_base(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return []
|
||||
|
||||
async def fake_build_scoped_filesystem(**kwargs):
|
||||
return {}
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
|
||||
fake_search_knowledge_base,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.build_scoped_filesystem",
|
||||
fake_build_scoped_filesystem,
|
||||
)
|
||||
|
||||
llm = FakeLLM(
|
||||
json.dumps(
|
||||
{
|
||||
"optimized_query": "ocv meeting decisions action items",
|
||||
"start_date": "2026-03-01",
|
||||
"end_date": "2026-03-31",
|
||||
}
|
||||
)
|
||||
)
|
||||
middleware = KnowledgeBaseSearchMiddleware(llm=llm, search_space_id=37)
|
||||
|
||||
result = await middleware.abefore_agent(
|
||||
{
|
||||
"messages": [
|
||||
HumanMessage(content="what happened in our OCV meeting last month?")
|
||||
]
|
||||
},
|
||||
runtime=None,
|
||||
)
|
||||
|
||||
assert result is not None
|
||||
assert captured["query"] == "ocv meeting decisions action items"
|
||||
assert captured["start_date"] is not None
|
||||
assert captured["end_date"] is not None
|
||||
assert captured["start_date"].date().isoformat() == "2026-03-01"
|
||||
assert captured["end_date"].date().isoformat() == "2026-03-31"
|
||||
assert llm.calls[0]["config"] == {"tags": ["surfsense:internal"]}
|
||||
|
||||
async def test_middleware_falls_back_when_planner_returns_invalid_json(
|
||||
self,
|
||||
monkeypatch,
|
||||
):
|
||||
captured: dict = {}
|
||||
|
||||
async def fake_search_knowledge_base(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return []
|
||||
|
||||
async def fake_build_scoped_filesystem(**kwargs):
|
||||
return {}
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
|
||||
fake_search_knowledge_base,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.build_scoped_filesystem",
|
||||
fake_build_scoped_filesystem,
|
||||
)
|
||||
|
||||
middleware = KnowledgeBaseSearchMiddleware(
|
||||
llm=FakeLLM("not json"),
|
||||
search_space_id=37,
|
||||
)
|
||||
|
||||
await middleware.abefore_agent(
|
||||
{"messages": [HumanMessage(content="summarize founders guide by deel")]},
|
||||
runtime=None,
|
||||
)
|
||||
|
||||
assert captured["query"] == "summarize founders guide by deel"
|
||||
assert captured["start_date"] is None
|
||||
assert captured["end_date"] is None
|
||||
|
||||
async def test_middleware_passes_none_dates_when_planner_returns_nulls(
|
||||
self,
|
||||
monkeypatch,
|
||||
):
|
||||
captured: dict = {}
|
||||
|
||||
async def fake_search_knowledge_base(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return []
|
||||
|
||||
async def fake_build_scoped_filesystem(**kwargs):
|
||||
return {}
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
|
||||
fake_search_knowledge_base,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.build_scoped_filesystem",
|
||||
fake_build_scoped_filesystem,
|
||||
)
|
||||
|
||||
middleware = KnowledgeBaseSearchMiddleware(
|
||||
llm=FakeLLM(
|
||||
json.dumps(
|
||||
{
|
||||
"optimized_query": "deel founders guide summary",
|
||||
"start_date": None,
|
||||
"end_date": None,
|
||||
}
|
||||
)
|
||||
),
|
||||
search_space_id=37,
|
||||
)
|
||||
|
||||
await middleware.abefore_agent(
|
||||
{"messages": [HumanMessage(content="summarize founders guide by deel")]},
|
||||
runtime=None,
|
||||
)
|
||||
|
||||
assert captured["query"] == "deel founders guide summary"
|
||||
assert captured["start_date"] is None
|
||||
assert captured["end_date"] is None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue