mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
Merge remote-tracking branch 'upstream/dev' into feat/unified-etl-pipeline
This commit is contained in:
commit
63a75052ca
76 changed files with 3041 additions and 376 deletions
|
|
@ -32,6 +32,7 @@ logger = logging.getLogger(__name__)
|
|||
class LLMRole:
    """String identifiers for the LLM roles a search space can configure."""

    # Agent / chat operations
    AGENT = "agent"
    # Document summarization
    DOCUMENT_SUMMARY = "document_summary"
    # Vision / screenshot analysis
    VISION = "vision"
|
||||
|
||||
|
||||
def get_global_llm_config(llm_config_id: int) -> dict | None:
|
||||
|
|
@ -187,7 +188,7 @@ async def get_search_space_llm_instance(
|
|||
Args:
|
||||
session: Database session
|
||||
search_space_id: Search Space ID
|
||||
role: LLM role ('agent' or 'document_summary')
|
||||
role: LLM role ('agent', 'document_summary', or 'vision')
|
||||
|
||||
Returns:
|
||||
ChatLiteLLM or ChatLiteLLMRouter instance, or None if not found
|
||||
|
|
@ -209,6 +210,8 @@ async def get_search_space_llm_instance(
|
|||
llm_config_id = search_space.agent_llm_id
|
||||
elif role == LLMRole.DOCUMENT_SUMMARY:
|
||||
llm_config_id = search_space.document_summary_llm_id
|
||||
elif role == LLMRole.VISION:
|
||||
llm_config_id = search_space.vision_llm_id
|
||||
else:
|
||||
logger.error(f"Invalid LLM role: {role}")
|
||||
return None
|
||||
|
|
@ -405,6 +408,13 @@ async def get_document_summary_llm(
|
|||
)
|
||||
|
||||
|
||||
async def get_vision_llm(
    session: AsyncSession, search_space_id: int
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
    """Return the LLM instance configured for the search space's vision role.

    Thin convenience wrapper around ``get_search_space_llm_instance`` with the
    role fixed to ``LLMRole.VISION`` (used for screenshot analysis).

    Args:
        session: Database session.
        search_space_id: Search Space ID.

    Returns:
        Whatever ``get_search_space_llm_instance`` returns for the vision
        role: an LLM instance, or None if not found.
    """
    vision_llm = await get_search_space_llm_instance(
        session, search_space_id, LLMRole.VISION
    )
    return vision_llm
|
||||
|
||||
|
||||
# Backward-compatible alias (LLM preferences are now per-search-space, not per-user)
|
||||
async def get_user_long_context_llm(
|
||||
session: AsyncSession,
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ Service for managing user page limits for ETL services.
|
|||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from pathlib import Path, PurePosixPath
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
|
@ -223,10 +223,155 @@ class PageLimitService:
|
|||
# Estimate ~2000 characters per page
|
||||
return max(1, content_length // 2000)
|
||||
|
||||
@staticmethod
|
||||
def estimate_pages_from_metadata(
|
||||
file_name_or_ext: str, file_size: int | str | None = None
|
||||
) -> int:
|
||||
"""Size-based page estimation from file name/extension and byte size.
|
||||
|
||||
Pure function — no file I/O, no database access. Used by cloud
|
||||
connectors (which only have API metadata) and as the internal
|
||||
fallback for :meth:`estimate_pages_before_processing`.
|
||||
|
||||
``file_name_or_ext`` can be a full filename (``"report.pdf"``) or
|
||||
a bare extension (``".pdf"``). ``file_size`` may be an int, a
|
||||
stringified int from a cloud API, or *None*.
|
||||
"""
|
||||
if file_size is not None:
|
||||
try:
|
||||
file_size = int(file_size)
|
||||
except (ValueError, TypeError):
|
||||
file_size = 0
|
||||
else:
|
||||
file_size = 0
|
||||
|
||||
if file_size <= 0:
|
||||
return 1
|
||||
|
||||
ext = PurePosixPath(file_name_or_ext).suffix.lower() if file_name_or_ext else ""
|
||||
if not ext and file_name_or_ext.startswith("."):
|
||||
ext = file_name_or_ext.lower()
|
||||
file_ext = ext
|
||||
|
||||
if file_ext == ".pdf":
|
||||
return max(1, file_size // (100 * 1024))
|
||||
|
||||
if file_ext in {
|
||||
".doc",
|
||||
".docx",
|
||||
".docm",
|
||||
".dot",
|
||||
".dotm",
|
||||
".odt",
|
||||
".ott",
|
||||
".sxw",
|
||||
".stw",
|
||||
".uot",
|
||||
".rtf",
|
||||
".pages",
|
||||
".wpd",
|
||||
".wps",
|
||||
".abw",
|
||||
".zabw",
|
||||
".cwk",
|
||||
".hwp",
|
||||
".lwp",
|
||||
".mcw",
|
||||
".mw",
|
||||
".sdw",
|
||||
".vor",
|
||||
}:
|
||||
return max(1, file_size // (50 * 1024))
|
||||
|
||||
if file_ext in {
|
||||
".ppt",
|
||||
".pptx",
|
||||
".pptm",
|
||||
".pot",
|
||||
".potx",
|
||||
".odp",
|
||||
".otp",
|
||||
".sxi",
|
||||
".sti",
|
||||
".uop",
|
||||
".key",
|
||||
".sda",
|
||||
".sdd",
|
||||
".sdp",
|
||||
}:
|
||||
return max(1, file_size // (200 * 1024))
|
||||
|
||||
if file_ext in {
|
||||
".xls",
|
||||
".xlsx",
|
||||
".xlsm",
|
||||
".xlsb",
|
||||
".xlw",
|
||||
".xlr",
|
||||
".ods",
|
||||
".ots",
|
||||
".fods",
|
||||
".numbers",
|
||||
".123",
|
||||
".wk1",
|
||||
".wk2",
|
||||
".wk3",
|
||||
".wk4",
|
||||
".wks",
|
||||
".wb1",
|
||||
".wb2",
|
||||
".wb3",
|
||||
".wq1",
|
||||
".wq2",
|
||||
".csv",
|
||||
".tsv",
|
||||
".slk",
|
||||
".sylk",
|
||||
".dif",
|
||||
".dbf",
|
||||
".prn",
|
||||
".qpw",
|
||||
".602",
|
||||
".et",
|
||||
".eth",
|
||||
}:
|
||||
return max(1, file_size // (100 * 1024))
|
||||
|
||||
if file_ext in {".epub"}:
|
||||
return max(1, file_size // (50 * 1024))
|
||||
|
||||
if file_ext in {".txt", ".log", ".md", ".markdown", ".htm", ".html", ".xml"}:
|
||||
return max(1, file_size // 3000)
|
||||
|
||||
if file_ext in {
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".png",
|
||||
".gif",
|
||||
".bmp",
|
||||
".tiff",
|
||||
".webp",
|
||||
".svg",
|
||||
".cgm",
|
||||
".odg",
|
||||
".pbd",
|
||||
}:
|
||||
return 1
|
||||
|
||||
if file_ext in {".mp3", ".m4a", ".wav", ".mpga"}:
|
||||
return max(1, file_size // (1024 * 1024))
|
||||
|
||||
if file_ext in {".mp4", ".mpeg", ".webm"}:
|
||||
return max(1, file_size // (5 * 1024 * 1024))
|
||||
|
||||
return max(1, file_size // (80 * 1024))
|
||||
|
||||
def estimate_pages_before_processing(self, file_path: str) -> int:
|
||||
"""
|
||||
Estimate page count from file before processing (to avoid unnecessary API calls).
|
||||
This is called BEFORE sending to ETL services to prevent cost on rejected files.
|
||||
Estimate page count from a local file before processing.
|
||||
|
||||
For PDFs, attempts to read the actual page count via pypdf.
|
||||
For everything else, delegates to :meth:`estimate_pages_from_metadata`.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file
|
||||
|
|
@ -240,7 +385,6 @@ class PageLimitService:
|
|||
file_ext = Path(file_path).suffix.lower()
|
||||
file_size = os.path.getsize(file_path)
|
||||
|
||||
# PDF files - try to get actual page count
|
||||
if file_ext == ".pdf":
|
||||
try:
|
||||
import pypdf
|
||||
|
|
@ -249,153 +393,6 @@ class PageLimitService:
|
|||
pdf_reader = pypdf.PdfReader(f)
|
||||
return len(pdf_reader.pages)
|
||||
except Exception:
|
||||
# If PDF reading fails, fall back to size estimation
|
||||
# Typical PDF: ~100KB per page (conservative estimate)
|
||||
return max(1, file_size // (100 * 1024))
|
||||
pass # fall through to size-based estimation
|
||||
|
||||
# Word Processing Documents
|
||||
# Microsoft Word, LibreOffice Writer, WordPerfect, Pages, etc.
|
||||
elif file_ext in [
|
||||
".doc",
|
||||
".docx",
|
||||
".docm",
|
||||
".dot",
|
||||
".dotm", # Microsoft Word
|
||||
".odt",
|
||||
".ott",
|
||||
".sxw",
|
||||
".stw",
|
||||
".uot", # OpenDocument/StarOffice Writer
|
||||
".rtf", # Rich Text Format
|
||||
".pages", # Apple Pages
|
||||
".wpd",
|
||||
".wps", # WordPerfect, Microsoft Works
|
||||
".abw",
|
||||
".zabw", # AbiWord
|
||||
".cwk",
|
||||
".hwp",
|
||||
".lwp",
|
||||
".mcw",
|
||||
".mw",
|
||||
".sdw",
|
||||
".vor", # Other word processors
|
||||
]:
|
||||
# Typical word document: ~50KB per page (conservative)
|
||||
return max(1, file_size // (50 * 1024))
|
||||
|
||||
# Presentation Documents
|
||||
# PowerPoint, Impress, Keynote, etc.
|
||||
elif file_ext in [
|
||||
".ppt",
|
||||
".pptx",
|
||||
".pptm",
|
||||
".pot",
|
||||
".potx", # Microsoft PowerPoint
|
||||
".odp",
|
||||
".otp",
|
||||
".sxi",
|
||||
".sti",
|
||||
".uop", # OpenDocument/StarOffice Impress
|
||||
".key", # Apple Keynote
|
||||
".sda",
|
||||
".sdd",
|
||||
".sdp", # StarOffice Draw/Impress
|
||||
]:
|
||||
# Typical presentation: ~200KB per slide (conservative)
|
||||
return max(1, file_size // (200 * 1024))
|
||||
|
||||
# Spreadsheet Documents
|
||||
# Excel, Calc, Numbers, Lotus, etc.
|
||||
elif file_ext in [
|
||||
".xls",
|
||||
".xlsx",
|
||||
".xlsm",
|
||||
".xlsb",
|
||||
".xlw",
|
||||
".xlr", # Microsoft Excel
|
||||
".ods",
|
||||
".ots",
|
||||
".fods", # OpenDocument Spreadsheet
|
||||
".numbers", # Apple Numbers
|
||||
".123",
|
||||
".wk1",
|
||||
".wk2",
|
||||
".wk3",
|
||||
".wk4",
|
||||
".wks", # Lotus 1-2-3
|
||||
".wb1",
|
||||
".wb2",
|
||||
".wb3",
|
||||
".wq1",
|
||||
".wq2", # Quattro Pro
|
||||
".csv",
|
||||
".tsv",
|
||||
".slk",
|
||||
".sylk",
|
||||
".dif",
|
||||
".dbf",
|
||||
".prn",
|
||||
".qpw", # Data formats
|
||||
".602",
|
||||
".et",
|
||||
".eth", # Other spreadsheets
|
||||
]:
|
||||
# Spreadsheets typically have 1 sheet = 1 page for ETL
|
||||
# Conservative: ~100KB per sheet
|
||||
return max(1, file_size // (100 * 1024))
|
||||
|
||||
# E-books
|
||||
elif file_ext in [".epub"]:
|
||||
# E-books vary widely, estimate by size
|
||||
# Typical e-book: ~50KB per page
|
||||
return max(1, file_size // (50 * 1024))
|
||||
|
||||
# Plain Text and Markup Files
|
||||
elif file_ext in [
|
||||
".txt",
|
||||
".log", # Plain text
|
||||
".md",
|
||||
".markdown", # Markdown
|
||||
".htm",
|
||||
".html",
|
||||
".xml", # Markup
|
||||
]:
|
||||
# Plain text: ~3000 bytes per page
|
||||
return max(1, file_size // 3000)
|
||||
|
||||
# Image Files
|
||||
# Each image is typically processed as 1 page
|
||||
elif file_ext in [
|
||||
".jpg",
|
||||
".jpeg", # JPEG
|
||||
".png", # PNG
|
||||
".gif", # GIF
|
||||
".bmp", # Bitmap
|
||||
".tiff", # TIFF
|
||||
".webp", # WebP
|
||||
".svg", # SVG
|
||||
".cgm", # Computer Graphics Metafile
|
||||
".odg",
|
||||
".pbd", # OpenDocument Graphics
|
||||
]:
|
||||
# Each image = 1 page
|
||||
return 1
|
||||
|
||||
# Audio Files (transcription = typically 1 page per minute)
|
||||
# Note: These should be handled by audio transcription flow, not ETL
|
||||
elif file_ext in [".mp3", ".m4a", ".wav", ".mpga"]:
|
||||
# Audio files: estimate based on duration
|
||||
# Fallback: ~1MB per minute of audio, 1 page per minute transcript
|
||||
return max(1, file_size // (1024 * 1024))
|
||||
|
||||
# Video Files (typically not processed for pages, but just in case)
|
||||
elif file_ext in [".mp4", ".mpeg", ".webm"]:
|
||||
# Video files: very rough estimate
|
||||
# Typically wouldn't be page-based, but use conservative estimate
|
||||
return max(1, file_size // (5 * 1024 * 1024))
|
||||
|
||||
# Other/Unknown Document Types
|
||||
else:
|
||||
# Conservative estimate: ~80KB per page
|
||||
# This catches: .sgl, .sxg, .uof, .uos1, .uos2, .web, and any future formats
|
||||
return max(1, file_size // (80 * 1024))
|
||||
return self.estimate_pages_from_metadata(file_ext, file_size)
|
||||
|
|
|
|||
225
surfsense_backend/app/services/vision_autocomplete_service.py
Normal file
225
surfsense_backend/app/services/vision_autocomplete_service.py
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
import logging
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
|
||||
from app.services.llm_service import get_vision_llm
|
||||
from app.services.new_streaming_service import VercelStreamingService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Number of results requested from the knowledge-base hybrid search.
KB_TOP_K = 5
# Character budget for the knowledge-base entries collected into the prompt.
KB_MAX_CHARS = 4000

# Prompt for the first (non-streaming) vision call: asks the model for a short
# description of the screenshot, which is then used as the KB search query.
EXTRACT_QUERY_PROMPT = """Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""

# Same as EXTRACT_QUERY_PROMPT, prefixed with the active application name and
# window title. Placeholders: {app_name}, {window_title}.
EXTRACT_QUERY_PROMPT_WITH_APP = """The user is currently in the application "{app_name}" with the window titled "{window_title}".

Look at this screenshot and describe in 1-2 short sentences what the user is working on and what topic they need to write about. Be specific about the subject matter. Output ONLY the description, nothing else."""

# Base system prompt for the streamed completion call.
VISION_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text.

You will receive a screenshot of the user's screen. Your job:
1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.).
2. Identify the text area where the user will type.
3. Based on the full visual context, generate the text the user most likely wants to write.

Key behavior:
- If the text area is EMPTY, draft a full response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document).
- If the text area already has text, continue it naturally.

Rules:
- Output ONLY the text to be inserted. No quotes, no explanations, no meta-commentary.
- Be concise but complete — a full thought, not a fragment.
- Match the tone and formality of the surrounding context.
- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal.
- Do NOT describe the screenshot or explain your reasoning.
- If you cannot determine what to write, output nothing."""

# Optional system-prompt suffix describing the active application.
# Placeholders: {app_name}, {window_title}.
APP_CONTEXT_BLOCK = """

The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly."""

# Optional system-prompt suffix carrying knowledge-base excerpts.
# Placeholder: {kb_context}.
KB_CONTEXT_BLOCK = """

You also have access to the user's knowledge base documents below. Use them to write more accurate, informed, and contextually relevant text. Do NOT cite or reference the documents explicitly — just let the knowledge inform your writing naturally.

<knowledge_base>
{kb_context}
</knowledge_base>"""
|
||||
|
||||
|
||||
def _build_system_prompt(app_name: str, window_title: str, kb_context: str) -> str:
    """Compose the system prompt for the streamed completion.

    The base vision prompt always comes first. The application block is
    appended only when ``app_name`` is non-empty, and the knowledge-base block
    only when ``kb_context`` is non-empty.

    Args:
        app_name: Name of the application the user is working in (may be empty).
        window_title: Title of the active window; only used with ``app_name``.
        kb_context: Formatted knowledge-base excerpts (may be empty).

    Returns:
        The concatenated system prompt.
    """
    sections = [VISION_SYSTEM_PROMPT]
    if app_name:
        sections.append(
            APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title)
        )
    if kb_context:
        sections.append(KB_CONTEXT_BLOCK.format(kb_context=kb_context))
    return "".join(sections)
|
||||
|
||||
|
||||
def _is_vision_unsupported_error(e: Exception) -> bool:
|
||||
"""Check if an exception indicates the model doesn't support vision/images."""
|
||||
msg = str(e).lower()
|
||||
return "content must be a string" in msg or "does not support image" in msg
|
||||
|
||||
|
||||
async def _extract_query_from_screenshot(
    llm,
    screenshot_data_url: str,
    app_name: str = "",
    window_title: str = "",
) -> str | None:
    """Ask the Vision LLM to describe what the user is working on.

    Sends one non-streaming request containing the extraction prompt and the
    screenshot, and returns the model's description for use as a search query.

    Args:
        llm: Chat model exposing an async ``ainvoke(messages)`` method.
        screenshot_data_url: Screenshot URL passed as the ``image_url`` part.
        app_name: Active application name; when non-empty, the app-aware
            prompt variant is used.
        window_title: Active window title, only used together with ``app_name``.

    Returns:
        The stripped description, or None when the model returned no text or
        the call failed for a reason other than missing vision support.

    Raises:
        Exception: Re-raises vision-unsupported errors so the caller can
            return a friendly message immediately instead of retrying with
            astream.
    """
    if app_name:
        prompt_text = EXTRACT_QUERY_PROMPT_WITH_APP.format(
            app_name=app_name,
            window_title=window_title,
        )
    else:
        prompt_text = EXTRACT_QUERY_PROMPT

    try:
        response = await llm.ainvoke(
            [
                HumanMessage(
                    content=[
                        {"type": "text", "text": prompt_text},
                        {"type": "image_url", "image_url": {"url": screenshot_data_url}},
                    ]
                ),
            ]
        )

        content = getattr(response, "content", "")
        if isinstance(content, list):
            # Message content may be a list of content blocks rather than a
            # plain string; keep only the textual parts instead of failing on
            # ``list.strip`` and silently losing the query.
            content = "".join(
                block if isinstance(block, str) else str(block.get("text", ""))
                for block in content
                if isinstance(block, str)
                or (isinstance(block, dict) and block.get("type") == "text")
            )

        # Any other non-string content (e.g. None) counts as "no description".
        query = content.strip() if isinstance(content, str) else ""
        return query or None
    except Exception as e:
        if _is_vision_unsupported_error(e):
            raise
        # Best effort: every other failure just means "no KB query available".
        logger.warning(f"Failed to extract query from screenshot: {e}")
        return None
|
||||
|
||||
|
||||
async def _search_knowledge_base(
    session: AsyncSession, search_space_id: int, query: str
) -> str:
    """Run a hybrid KB search and format the hits as prompt context.

    Each non-empty chunk becomes a ``[title]`` line followed by the chunk
    text, and entries are joined with a ``---`` separator. Collection is
    bounded by ``KB_MAX_CHARS`` (separators are not counted).

    Args:
        session: Database session handed to the retriever.
        search_space_id: Search space to search in.
        query: Query text for the hybrid search.

    Returns:
        The formatted context, or an empty string when there are no results
        or the search fails (failures are logged and swallowed).
    """
    try:
        hits = await ChucksHybridSearchRetriever(session).hybrid_search(
            query_text=query,
            top_k=KB_TOP_K,
            search_space_id=search_space_id,
        )
        if not hits:
            return ""

        snippets: list[str] = []
        used = 0
        for hit in hits:
            heading = hit.get("document", {}).get("title", "Untitled")
            for piece in hit.get("chunks", []):
                body = piece.get("content", "").strip()
                if body:
                    snippet = f"[{heading}]\n{body}"
                    # An entry that would overflow the budget ends this
                    # document's chunks; later documents are still considered.
                    if used + len(snippet) > KB_MAX_CHARS:
                        break
                    snippets.append(snippet)
                    used += len(snippet)
            # Stop scanning further documents once the budget is fully used.
            if used >= KB_MAX_CHARS:
                break

        return "\n\n---\n\n".join(snippets)
    except Exception as exc:
        logger.warning(f"KB search failed, proceeding without context: {exc}")
        return ""
|
||||
|
||||
|
||||
async def stream_vision_autocomplete(
    screenshot_data_url: str,
    search_space_id: int,
    session: AsyncSession,
    *,
    app_name: str = "",
    window_title: str = "",
) -> AsyncGenerator[str, None]:
    """Stream an autocomplete suggestion generated from a screenshot.

    Steps:
      1. Resolve the search space's vision LLM; emit an error if none is set.
      2. Ask the model (non-streaming) for a short description of the
         screenshot to use as a knowledge-base query.
      3. Look up knowledge-base context for that query, if one was produced.
      4. Stream the completion built from the screenshot, KB context and
         application context.

    Args:
        screenshot_data_url: Screenshot URL sent to the model as an image part.
        search_space_id: Search space whose vision LLM and KB are used.
        session: Database session.
        app_name: Optional name of the application the user is working in.
        window_title: Optional title of the active window.

    Yields:
        Frames produced by ``VercelStreamingService``: message start, text
        start/delta/end, finish and done on success; an error frame followed
        by done on failure.
    """
    streaming = VercelStreamingService()
    vision_error_msg = (
        "The selected model does not support vision. "
        "Please set a vision-capable model (e.g. GPT-4o, Gemini) in your search space settings."
    )

    llm = await get_vision_llm(session, search_space_id)
    if not llm:
        yield streaming.format_message_start()
        yield streaming.format_error("No Vision LLM configured for this search space")
        yield streaming.format_done()
        return

    # The extraction helper documents that it only lets vision-unsupported
    # errors escape, so any exception here is reported as such.
    try:
        query = await _extract_query_from_screenshot(
            llm,
            screenshot_data_url,
            app_name=app_name,
            window_title=window_title,
        )
    except Exception as e:
        logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
        yield streaming.format_message_start()
        yield streaming.format_error(vision_error_msg)
        yield streaming.format_done()
        return

    # KB context is optional: without a query the prompt simply omits it.
    kb_context = ""
    if query:
        kb_context = await _search_knowledge_base(session, search_space_id, query)

    system_prompt = _build_system_prompt(app_name, window_title, kb_context)

    instruction_part = {
        "type": "text",
        "text": "Analyze this screenshot. Understand the full context of what the user is working on, then generate the text they most likely want to write in the active text area.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": screenshot_data_url},
    }
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=[instruction_part, image_part]),
    ]

    text_started = False
    text_id = ""
    try:
        yield streaming.format_message_start()
        text_id = streaming.generate_text_id()
        yield streaming.format_text_start(text_id)
        text_started = True

        async for chunk in llm.astream(messages):
            if hasattr(chunk, "content"):
                token = chunk.content
            else:
                token = str(chunk)
            # Empty tokens are not forwarded.
            if token:
                yield streaming.format_text_delta(text_id, token)

        yield streaming.format_text_end(text_id)
        yield streaming.format_finish()
        yield streaming.format_done()

    except Exception as e:
        # Close the text part that was already opened before reporting.
        if text_started:
            yield streaming.format_text_end(text_id)

        if _is_vision_unsupported_error(e):
            logger.warning(f"Vision autocomplete: selected model does not support vision: {e}")
            yield streaming.format_error(vision_error_msg)
        else:
            logger.error(f"Vision autocomplete streaming error: {e}", exc_info=True)
            yield streaming.format_error("Autocomplete failed. Please try again.")
        yield streaming.format_done()
|
||||
Loading…
Add table
Add a link
Reference in a new issue