Merge remote-tracking branch 'upstream/dev' into feat/azure-ocr

This commit is contained in:
Anish Sarkar 2026-04-08 05:00:32 +05:30
commit 6038f6dfc0
84 changed files with 6041 additions and 1065 deletions

View file

@ -102,6 +102,44 @@ def load_global_image_gen_configs():
return []
def load_global_vision_llm_configs():
    """Load global vision LLM configurations from the shared YAML config file.

    Returns:
        list: The entries under the ``global_vision_llm_configs`` key, or an
        empty list when the config file is missing, empty, malformed, or
        unreadable. Failures are reported as a console warning, never raised.
    """
    global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml"
    if not global_config_file.exists():
        return []
    try:
        with open(global_config_file, encoding="utf-8") as f:
            data = yaml.safe_load(f)
        # safe_load returns None for an empty file (and any scalar/list for a
        # malformed one); treat those as "no configs" instead of letting
        # `.get` raise AttributeError and print a spurious warning below.
        if not isinstance(data, dict):
            return []
        return data.get("global_vision_llm_configs", [])
    except Exception as e:
        print(f"Warning: Failed to load global vision LLM configs: {e}")
        return []
def load_vision_llm_router_settings():
    """Load router settings for Vision LLM Auto mode from the YAML config.

    Settings found in the file are merged over built-in defaults, so a
    partial ``vision_llm_router_settings`` mapping only overrides the keys
    it provides.

    Returns:
        dict: The merged router settings; the defaults alone when the file
        is missing, empty, malformed, or unreadable.
    """
    default_settings = {
        "routing_strategy": "usage-based-routing",
        "num_retries": 3,
        "allowed_fails": 3,
        "cooldown_time": 60,
    }
    global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml"
    if not global_config_file.exists():
        return default_settings
    try:
        with open(global_config_file, encoding="utf-8") as f:
            data = yaml.safe_load(f)
        # safe_load yields None for an empty file; guard so `.get` doesn't
        # raise and fall into the warning branch for a benign empty config.
        if not isinstance(data, dict):
            return default_settings
        settings = data.get("vision_llm_router_settings", {})
        # A null/non-mapping value under the key would break the dict merge.
        if not isinstance(settings, dict):
            return default_settings
        return {**default_settings, **settings}
    except Exception as e:
        print(f"Warning: Failed to load vision LLM router settings: {e}")
        return default_settings
def load_image_gen_router_settings():
"""
Load router settings for image generation Auto mode from YAML file.
@ -182,6 +220,29 @@ def initialize_image_gen_router():
print(f"Warning: Failed to initialize Image Generation Router: {e}")
def initialize_vision_llm_router():
    """Initialize the shared Vision LLM Router from the global YAML config.

    When no global vision configs are present, Auto mode is simply left
    unavailable (an informational message is printed). Initialization
    errors are downgraded to console warnings rather than raised.
    """
    configs = load_global_vision_llm_configs()
    settings = load_vision_llm_router_settings()

    # Nothing to route across — skip setup entirely.
    if not configs:
        print(
            "Info: No global vision LLM configs found, "
            "Vision LLM Auto mode will not be available"
        )
        return

    try:
        # Imported lazily so the app still starts if the service is absent.
        from app.services.vision_llm_router_service import VisionLLMRouterService

        VisionLLMRouterService.initialize(configs, settings)
        strategy = settings.get("routing_strategy", "usage-based-routing")
        print(
            f"Info: Vision LLM Router initialized with {len(configs)} models "
            f"(strategy: {strategy})"
        )
    except Exception as e:
        print(f"Warning: Failed to initialize Vision LLM Router: {e}")
class Config:
# Check if ffmpeg is installed
if not is_ffmpeg_installed():
@ -335,6 +396,12 @@ class Config:
# Router settings for Image Generation Auto mode
IMAGE_GEN_ROUTER_SETTINGS = load_image_gen_router_settings()
# Global Vision LLM Configurations (optional)
GLOBAL_VISION_LLM_CONFIGS = load_global_vision_llm_configs()
# Router settings for Vision LLM Auto mode
VISION_LLM_ROUTER_SETTINGS = load_vision_llm_router_settings()
# Chonkie Configuration | Edit this to your needs
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
# Azure OpenAI credentials from environment variables

View file

@ -263,6 +263,82 @@ global_image_generation_configs:
# rpm: 30
# litellm_params: {}
# =============================================================================
# Vision LLM Configuration
# =============================================================================
# These configurations power the vision autocomplete feature (screenshot analysis).
# Only vision-capable models should be used here (e.g. GPT-4o, Gemini Pro, Claude 3).
# Supported providers: OpenAI, Anthropic, Google, Azure OpenAI, Vertex AI, Bedrock,
# xAI, OpenRouter, Ollama, Groq, Together AI, Fireworks AI, DeepSeek, Mistral, Custom
#
# Auto mode (ID 0) uses LiteLLM Router for load balancing across all vision configs.
# Router Settings for Vision LLM Auto Mode
vision_llm_router_settings:
routing_strategy: "usage-based-routing"
num_retries: 3
allowed_fails: 3
cooldown_time: 60
global_vision_llm_configs:
# Example: OpenAI GPT-4o (recommended for vision)
- id: -1
name: "Global GPT-4o Vision"
description: "OpenAI's GPT-4o with strong vision capabilities"
provider: "OPENAI"
model_name: "gpt-4o"
api_key: "sk-your-openai-api-key-here"
api_base: ""
rpm: 500
tpm: 100000
litellm_params:
temperature: 0.3
max_tokens: 1000
# Example: Google Gemini 2.0 Flash
- id: -2
name: "Global Gemini 2.0 Flash"
description: "Google's fast vision model with large context"
provider: "GOOGLE"
model_name: "gemini-2.0-flash"
api_key: "your-google-ai-api-key-here"
api_base: ""
rpm: 1000
tpm: 200000
litellm_params:
temperature: 0.3
max_tokens: 1000
# Example: Anthropic Claude 3.5 Sonnet
- id: -3
name: "Global Claude 3.5 Sonnet Vision"
description: "Anthropic's Claude 3.5 Sonnet with vision support"
provider: "ANTHROPIC"
model_name: "claude-3-5-sonnet-20241022"
api_key: "sk-ant-your-anthropic-api-key-here"
api_base: ""
rpm: 1000
tpm: 100000
litellm_params:
temperature: 0.3
max_tokens: 1000
# Example: Azure OpenAI GPT-4o
# - id: -4
# name: "Global Azure GPT-4o Vision"
# description: "Azure-hosted GPT-4o for vision analysis"
# provider: "AZURE_OPENAI"
# model_name: "azure/gpt-4o-deployment"
# api_key: "your-azure-api-key-here"
# api_base: "https://your-resource.openai.azure.com"
# api_version: "2024-02-15-preview"
# rpm: 500
# tpm: 100000
# litellm_params:
# temperature: 0.3
# max_tokens: 1000
# base_model: "gpt-4o"
# Notes:
# - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing
# - Use negative IDs to distinguish global configs from user configs (NewLLMConfig in DB)
@ -283,3 +359,9 @@ global_image_generation_configs:
# - The router uses litellm.aimage_generation() for async image generation
# - Only RPM (requests per minute) is relevant for image generation rate limiting.
# TPM (tokens per minute) does not apply since image APIs are billed/rate-limited per request, not per token.
#
# VISION LLM NOTES:
# - Vision configs use the same ID scheme (negative for global, positive for user DB)
# - Only use vision-capable models (GPT-4o, Gemini, Claude 3, etc.)
# - Lower temperature (0.3) is recommended for accurate screenshot analysis
# - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions

View file

@ -0,0 +1,23 @@
[
{"value": "gpt-4o", "label": "GPT-4o", "provider": "OPENAI", "context_window": "128K"},
{"value": "gpt-4o-mini", "label": "GPT-4o Mini", "provider": "OPENAI", "context_window": "128K"},
{"value": "gpt-4-turbo", "label": "GPT-4 Turbo", "provider": "OPENAI", "context_window": "128K"},
{"value": "claude-sonnet-4-20250514", "label": "Claude Sonnet 4", "provider": "ANTHROPIC", "context_window": "200K"},
{"value": "claude-3-7-sonnet-20250219", "label": "Claude 3.7 Sonnet", "provider": "ANTHROPIC", "context_window": "200K"},
{"value": "claude-3-5-sonnet-20241022", "label": "Claude 3.5 Sonnet", "provider": "ANTHROPIC", "context_window": "200K"},
{"value": "claude-3-opus-20240229", "label": "Claude 3 Opus", "provider": "ANTHROPIC", "context_window": "200K"},
{"value": "claude-3-haiku-20240307", "label": "Claude 3 Haiku", "provider": "ANTHROPIC", "context_window": "200K"},
{"value": "gemini-2.5-flash", "label": "Gemini 2.5 Flash", "provider": "GOOGLE", "context_window": "1M"},
{"value": "gemini-2.5-pro", "label": "Gemini 2.5 Pro", "provider": "GOOGLE", "context_window": "1M"},
{"value": "gemini-2.0-flash", "label": "Gemini 2.0 Flash", "provider": "GOOGLE", "context_window": "1M"},
{"value": "gemini-1.5-pro", "label": "Gemini 1.5 Pro", "provider": "GOOGLE", "context_window": "1M"},
{"value": "gemini-1.5-flash", "label": "Gemini 1.5 Flash", "provider": "GOOGLE", "context_window": "1M"},
{"value": "pixtral-large-latest", "label": "Pixtral Large", "provider": "MISTRAL", "context_window": "128K"},
{"value": "pixtral-12b-2409", "label": "Pixtral 12B", "provider": "MISTRAL", "context_window": "128K"},
{"value": "grok-2-vision-1212", "label": "Grok 2 Vision", "provider": "XAI", "context_window": "32K"},
{"value": "llava", "label": "LLaVA", "provider": "OLLAMA"},
{"value": "bakllava", "label": "BakLLaVA", "provider": "OLLAMA"},
{"value": "llava-llama3", "label": "LLaVA Llama 3", "provider": "OLLAMA"},
{"value": "llama-4-scout-17b-16e-instruct", "label": "Llama 4 Scout 17B", "provider": "GROQ", "context_window": "128K"},
{"value": "meta-llama/Llama-4-Scout-17B-16E-Instruct", "label": "Llama 4 Scout 17B", "provider": "TOGETHER_AI", "context_window": "128K"}
]