# Extracted from commit 407bfcd94ff3798f5bc0a31288760125f44c839d
# ("retrieval: add source label and retrieved-document adapter").
# File: surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/adapter.py
"""Turn retriever ``DocumentHit``s into renderable ``RetrievedDocument``s."""

from __future__ import annotations

from app.agents.chat.multi_agent_chat.shared.retrieved_context import (
    RetrievedDocument,
    RetrievedPassage,
)

from .models import DocumentHit
from .source_label import source_label


def to_retrieved_document(hit: DocumentHit) -> RetrievedDocument:
    """Map one hit to the shape the retrieved-context renderer consumes.

    The hit's id and title are copied over, the label is built by
    ``source_label`` from the hit's document type and metadata, and every
    chunk in ``hit.chunks`` becomes one ``RetrievedPassage`` that carries the
    parent document id alongside the chunk's own id and content.
    """
    doc_id = hit.document_id
    label = source_label(hit.document_type, hit.metadata)
    passages = [
        RetrievedPassage(
            document_id=doc_id,
            chunk_id=piece.chunk_id,
            content=piece.content,
        )
        for piece in hit.chunks
    ]
    return RetrievedDocument(
        document_id=doc_id,
        title=hit.title,
        source_label=label,
        passages=passages,
    )


__all__ = ["to_retrieved_document"]

# Next file added by the same commit:
# surfsense_backend/app/agents/chat/multi_agent_chat/shared/retrieval/source_label.py
@@ -0,0 +1,67 @@ +"""Build a short, honest source label for a retrieved document. + +A label orients the model about where a passage came from — e.g. ``Slack`` or +``Web · docs.python.org``. It is derived only from the document's type and any +URL in its metadata, so it never asserts detail we don't actually have. +""" + +from __future__ import annotations + +from typing import Any +from urllib.parse import urlparse + +_FRIENDLY_NAMES = { + "FILE": "File", + "NOTE": "Note", + "EXTENSION": "Saved page", + "CRAWLED_URL": "Web", + "YOUTUBE_VIDEO": "YouTube", + "SLACK_CONNECTOR": "Slack", + "TEAMS_CONNECTOR": "Teams", + "DISCORD_CONNECTOR": "Discord", + "NOTION_CONNECTOR": "Notion", + "GITHUB_CONNECTOR": "GitHub", + "LINEAR_CONNECTOR": "Linear", + "JIRA_CONNECTOR": "Jira", + "CONFLUENCE_CONNECTOR": "Confluence", + "CLICKUP_CONNECTOR": "ClickUp", + "AIRTABLE_CONNECTOR": "Airtable", + "OBSIDIAN_CONNECTOR": "Obsidian", + "BOOKSTACK_CONNECTOR": "BookStack", +} + +_URL_KEYS = ("url", "source_url", "link", "source") + + +def source_label(document_type: str | None, metadata: dict[str, Any]) -> str | None: + """``Source`` or ``Source · host``; ``None`` when nothing is known.""" + name = _friendly_name(document_type) + host = _url_host(metadata) + if name and host: + return f"{name} · {host}" + return name or host + + +def _friendly_name(document_type: str | None) -> str | None: + if not document_type: + return None + return _FRIENDLY_NAMES.get(document_type, _prettify(document_type)) + + +def _prettify(document_type: str) -> str: + """Fallback name for unmapped types: ``GOOGLE_DRIVE_FILE`` → ``Google Drive``.""" + words = document_type.replace("_CONNECTOR", "").replace("_FILE", "").split("_") + return " ".join(word.capitalize() for word in words if word) + + +def _url_host(metadata: dict[str, Any]) -> str | None: + for key in _URL_KEYS: + value = metadata.get(key) + if isinstance(value, str) and value.startswith(("http://", "https://")): + host = urlparse(value).netloc + if 
host: + return host.removeprefix("www.") + return None + + +__all__ = ["source_label"]