mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
retrieval: add source label and retrieved-document adapter
This commit is contained in:
parent
608192057f
commit
407bfcd94f
2 changed files with 98 additions and 0 deletions
|
|
@ -0,0 +1,31 @@
|
|||
"""Turn retriever ``DocumentHit``s into renderable ``RetrievedDocument``s."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.agents.chat.multi_agent_chat.shared.retrieved_context import (
|
||||
RetrievedDocument,
|
||||
RetrievedPassage,
|
||||
)
|
||||
|
||||
from .models import DocumentHit
|
||||
from .source_label import source_label
|
||||
|
||||
|
||||
def to_retrieved_document(hit: DocumentHit) -> RetrievedDocument:
    """Convert one retriever hit into the shape the ``<retrieved_context>`` renderer takes.

    The hit's id and title are carried over unchanged, a source label is derived
    from its document type and metadata, and every chunk becomes one passage
    that points back at the parent document.
    """
    doc_id = hit.document_id
    title = hit.title
    label = source_label(hit.document_type, hit.metadata)

    passages = []
    for piece in hit.chunks:
        passages.append(
            RetrievedPassage(
                document_id=doc_id,
                chunk_id=piece.chunk_id,
                content=piece.content,
            )
        )

    return RetrievedDocument(
        document_id=doc_id,
        title=title,
        source_label=label,
        passages=passages,
    )
|
||||
|
||||
|
||||
__all__ = ["to_retrieved_document"]
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
"""Build a short, honest source label for a retrieved document.
|
||||
|
||||
A label orients the model about where a passage came from — e.g. ``Slack`` or
|
||||
``Web · docs.python.org``. It is derived only from the document's type and any
|
||||
URL in its metadata, so it never asserts detail we don't actually have.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Display names for known document types, keyed by the raw type string.
_FRIENDLY_NAMES: dict[str, str] = {
    # Types without a ``_CONNECTOR`` suffix.
    "FILE": "File",
    "NOTE": "Note",
    "EXTENSION": "Saved page",
    "CRAWLED_URL": "Web",
    "YOUTUBE_VIDEO": "YouTube",
    # ``*_CONNECTOR`` types; spelled out so brand casing is exact.
    "SLACK_CONNECTOR": "Slack",
    "TEAMS_CONNECTOR": "Teams",
    "DISCORD_CONNECTOR": "Discord",
    "NOTION_CONNECTOR": "Notion",
    "GITHUB_CONNECTOR": "GitHub",
    "LINEAR_CONNECTOR": "Linear",
    "JIRA_CONNECTOR": "Jira",
    "CONFLUENCE_CONNECTOR": "Confluence",
    "CLICKUP_CONNECTOR": "ClickUp",
    "AIRTABLE_CONNECTOR": "Airtable",
    "OBSIDIAN_CONNECTOR": "Obsidian",
    "BOOKSTACK_CONNECTOR": "BookStack",
}
|
||||
|
||||
# Metadata keys that may hold a document URL, listed in lookup order.
_URL_KEYS: tuple[str, ...] = (
    "url",
    "source_url",
    "link",
    "source",
)
|
||||
|
||||
|
||||
def source_label(document_type: str | None, metadata: dict[str, Any]) -> str | None:
    """Return ``Source · host``, or whichever half is known; ``None`` if neither is."""
    name = _friendly_name(document_type)
    host = _url_host(metadata)
    # With only one half available, that half is the whole label.
    if not name:
        return host
    if not host:
        return name
    return f"{name} · {host}"
|
||||
|
||||
|
||||
def _friendly_name(document_type: str | None) -> str | None:
    """Look up the display name for *document_type*; ``None`` when it is empty or missing.

    Types absent from ``_FRIENDLY_NAMES`` get a name derived from the type string.
    """
    if document_type:
        fallback = _prettify(document_type)
        return _FRIENDLY_NAMES.get(document_type, fallback)
    return None
|
||||
|
||||
|
||||
def _prettify(document_type: str) -> str:
    """Fallback name for unmapped types: ``GOOGLE_DRIVE_FILE`` → ``Google Drive``.

    A trailing ``_CONNECTOR`` and then a trailing ``_FILE`` are dropped, and the
    remaining underscore-separated words are capitalized and joined with spaces.
    Returns an empty string when no words remain.
    """
    # Strip the markers only as suffixes: a substring replace would also eat the
    # "_FILE" inside e.g. "SHARED_FILES" and fuse the surrounding words.
    stem = document_type.removesuffix("_CONNECTOR").removesuffix("_FILE")
    return " ".join(word.capitalize() for word in stem.split("_") if word)
|
||||
|
||||
|
||||
def _url_host(metadata: dict[str, Any]) -> str | None:
    """Return the host of the first usable http(s) URL in *metadata*, else ``None``.

    Keys are tried in ``_URL_KEYS`` order. A value is usable when it is a string
    starting with ``http://`` or ``https://`` that parses to a non-empty host.
    Any ``user:password@`` prefix and a leading ``www.`` are removed; a port, if
    present, is kept.
    """
    for key in _URL_KEYS:
        value = metadata.get(key)
        if not isinstance(value, str) or not value.startswith(("http://", "https://")):
            continue
        try:
            netloc = urlparse(value).netloc
        except ValueError:
            # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets).
            # A bad URL must not break labelling, so move on to the next key.
            continue
        # Drop userinfo so credentials embedded in a URL never reach the label.
        host = netloc.rpartition("@")[2].removeprefix("www.")
        if host:
            return host
    return None
|
||||
|
||||
|
||||
__all__ = ["source_label"]
|
||||
Loading…
Add table
Add a link
Reference in a new issue