From c5afce38733ab98d4817f111b1380ee4a98f5e3e Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Sat, 6 Jun 2026 01:15:04 +0530 Subject: [PATCH] feat(docker): add GPU support and enhance installation scripts - Introduced a new docker-compose.gpu.yml file to define GPU resource reservations for backend services. - Updated .env.example to include GPU-related environment variables and usage instructions. - Enhanced install.ps1 and install.sh scripts to support GPU variant selection and validation for GPU count. - Improved error handling and user feedback for invalid GPU configurations. --- docker/.env.example | 8 +- docker/docker-compose.gpu.yml | 30 ++++ docker/scripts/install.ps1 | 306 ++++++++++++++++++-------------- docker/scripts/install.sh | 321 ++++++++++++++++++++-------------- 4 files changed, 401 insertions(+), 264 deletions(-) create mode 100644 docker/docker-compose.gpu.yml diff --git a/docker/.env.example b/docker/.env.example index 4f33c92b0..5f0f3c018 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -8,8 +8,14 @@ SURFSENSE_VERSION=latest # Image variant: empty = CPU (default), "cuda" = CUDA 12.8, "cuda126" = CUDA 12.6. -# NOTE: this only selects the GPU-built image. GPU device access lands in Phase 3. +# GPU acceleration also requires the NVIDIA Container Toolkit on the host and +# the GPU overlay in COMPOSE_FILE. Linux/macOS use ":"; Windows uses ";". +# Example Linux/macOS: COMPOSE_FILE=docker-compose.yml:docker-compose.gpu.yml +# Example Windows: COMPOSE_FILE=docker-compose.yml;docker-compose.gpu.yml +# Use "cuda126" for older NVIDIA driver stacks; use "cuda" for newer drivers. SURFSENSE_VARIANT= +# COMPOSE_FILE=docker-compose.yml:docker-compose.gpu.yml +# SURFSENSE_GPU_COUNT=1 # Deployment environment: dev or production SURFSENSE_ENV=production diff --git a/docker/docker-compose.gpu.yml b/docker/docker-compose.gpu.yml new file mode 100644 index 000000000..a40aeb32f --- /dev/null +++ b/docker/docker-compose.gpu.yml @@ -0,0 +1,30 @@ +services: + backend: + deploy: + resources: + reservations: + devices: + - driver: ${SURFSENSE_GPU_DRIVER:-nvidia} + count: ${SURFSENSE_GPU_COUNT:-1} + capabilities: + - gpu + + celery_worker: + deploy: + resources: + reservations: + devices: + - driver: ${SURFSENSE_GPU_DRIVER:-nvidia} + count: ${SURFSENSE_GPU_COUNT:-1} + capabilities: + - gpu + + celery_beat: + deploy: + resources: + reservations: + devices: + - driver: ${SURFSENSE_GPU_DRIVER:-nvidia} + count: ${SURFSENSE_GPU_COUNT:-1} + capabilities: + - gpu diff --git a/docker/scripts/install.ps1 b/docker/scripts/install.ps1 index 60c4fd5df..b44e17ef3 100644 --- a/docker/scripts/install.ps1 +++ b/docker/scripts/install.ps1 @@ -7,6 +7,8 @@ # To pass flags, save and run locally: # .\install.ps1 -NoWatchtower # .\install.ps1 -WatchtowerInterval 3600 +# .\install.ps1 -Variant cuda +# .\install.ps1 -Variant cuda -GpuCount all # # Handles two cases automatically: # 1. Fresh install — no prior SurfSense data detected @@ -17,7 +19,11 @@ param( [switch]$NoWatchtower, - [int]$WatchtowerInterval = 86400 + [int]$WatchtowerInterval = 86400, + [ValidateSet("cpu", "cuda", "cuda126")] + [string]$Variant, + [string]$GpuCount, + [switch]$Quiet ) $ErrorActionPreference = 'Stop' @@ -34,6 +40,11 @@ $MigrationMode = $false $SetupWatchtower = -not $NoWatchtower $WatchtowerContainer = "watchtower" +if ($GpuCount -and $GpuCount -notmatch '^([0-9]+|all)$') { + Write-Host "[SurfSense] ERROR: Invalid -GpuCount '$GpuCount'. Use a number or 'all'." -ForegroundColor Red + exit 1 +} + # ── Output helpers ────────────────────────────────────────────────────────── function Write-Info { param([string]$Msg) Write-Host "[SurfSense] " -ForegroundColor Cyan -NoNewline; Write-Host $Msg } @@ -97,101 +108,7 @@ function Wait-ForPostgres { Write-Ok "PostgreSQL is ready." } -# ── Stack health helpers ──────────────────────────────────────────────────── - -function Get-ComposeServices { - Push-Location $InstallDir - try { - $raw = Invoke-NativeSafe { docker compose ps -a --format json 2>$null } - } finally { - Pop-Location - } - if ([string]::IsNullOrWhiteSpace($raw)) { return @() } - - # Compose v2.21+ emits a JSON array; older versions emit one object per line. - try { - $parsed = $raw | ConvertFrom-Json - if ($parsed -is [System.Collections.IEnumerable] -and -not ($parsed -is [string])) { - return @($parsed) - } - return @($parsed) - } catch { - $services = @() - foreach ($line in ($raw -split "`r?`n")) { - $line = $line.Trim() - if (-not $line) { continue } - try { $services += ($line | ConvertFrom-Json) } catch { } - } - return $services - } -} - -function Wait-StackHealthy { - param([int]$TimeoutSec = 300) - - $deadline = (Get-Date).AddSeconds($TimeoutSec) - $lastReport = "" - - while ((Get-Date) -lt $deadline) { - $services = Get-ComposeServices - if (-not $services -or $services.Count -eq 0) { - Start-Sleep -Seconds 3 - continue - } - - $bad = @() - $waiting = @() - $good = @() - - foreach ($svc in $services) { - $name = $svc.Service - $state = $svc.State - $health = if ($svc.PSObject.Properties.Name -contains 'Health') { $svc.Health } else { '' } - $exit = if ($svc.PSObject.Properties.Name -contains 'ExitCode') { $svc.ExitCode } else { $null } - - if ($name -eq 'migrations') { - if ($state -eq 'exited' -and $exit -eq 0) { $good += $name } - elseif ($state -eq 'exited') { $bad += "${name} (exit=${exit})" } - else { $waiting += "${name} (${state})" } - continue - } - - if ($state -eq 'running') { - if ([string]::IsNullOrEmpty($health) -or $health -eq 'healthy') { - $good += $name - } elseif ($health -eq 'starting') { - $waiting += "${name} (starting)" - } elseif ($health -eq 'unhealthy') { - $bad += "${name} (unhealthy)" - } else { - $waiting += "${name} (${health})" - } - } elseif ($state -eq 'restarting') { - $bad += "${name} (restarting)" - } elseif ($state -eq 'exited') { - $bad += "${name} (exited, code=${exit})" - } else { - $waiting += "${name} (${state})" - } - } - - if ($bad.Count -gt 0) { - return @{ Ok = $false; Reason = 'failure'; Bad = $bad; Waiting = $waiting; Good = $good } - } - if ($waiting.Count -eq 0) { - return @{ Ok = $true; Reason = 'all_healthy'; Good = $good } - } - - $report = "Waiting on: " + ($waiting -join ', ') - if ($report -ne $lastReport) { - Write-Info $report - $lastReport = $report - } - Start-Sleep -Seconds 5 - } - - return @{ Ok = $false; Reason = 'timeout'; Bad = $bad; Waiting = $waiting; Good = $good } -} +# ── Stack startup helper ──────────────────────────────────────────────────── function Test-StaleZeroCacheVolume { $raw = Invoke-NativeSafe { docker volume ls --format '{{.Name}}' 2>$null } @@ -221,19 +138,9 @@ function Invoke-StaleZeroCacheCleanup { Write-Ok "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start." } -function Write-Err-NoExit { - param([string]$Message) - Write-Host "[ERROR] $Message" -ForegroundColor Red -} - function Invoke-StackFailureReport { - param([hashtable]$Result) - Write-Host "" - Write-Err-NoExit "Stack did not reach a healthy state." - if ($Result.Bad.Count -gt 0) { Write-Host (" Failed: " + ($Result.Bad -join ', ')) } - if ($Result.Waiting.Count -gt 0) { Write-Host (" Stuck: " + ($Result.Waiting -join ', ')) } - + Write-Host "[ERROR] Stack did not reach a healthy state." -ForegroundColor Red Write-Host "" Write-Info "Recent logs from migrations / zero-cache / backend:" Push-Location $InstallDir @@ -247,11 +154,151 @@ function Invoke-StackFailureReport { Write-Host "Recovery hints:" -ForegroundColor Yellow Write-Host " 1. Inspect migrations: cd $InstallDir; docker compose logs migrations" Write-Host " 2. Verify publication: cd $InstallDir; docker compose exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'" - Write-Host " 3. Hard reset zero db: cd $InstallDir; docker compose down; docker volume rm surfsense-zero-cache; docker compose up -d" + Write-Host " 3. Hard reset zero db: cd $InstallDir; docker compose down; docker volume rm surfsense-zero-cache; docker compose up -d --wait" Write-Host "" exit 1 } +function Invoke-ComposeUpWait { + Push-Location $InstallDir + try { + Invoke-NativeSafe { docker compose up -d --wait } + } finally { + Pop-Location + } + if ($LASTEXITCODE -ne 0) { + Invoke-StackFailureReport + } +} + +# ── Variant and .env helpers ──────────────────────────────────────────────── + +function Set-EnvValue { + param([string]$Path, [string]$Key, [string]$Value) + $lines = @() + if (Test-Path $Path) { + $lines = @(Get-Content $Path) + } + $updated = $false + $newLines = foreach ($line in $lines) { + if ($line -match "^$([regex]::Escape($Key))=") { + $updated = $true + "$Key=$Value" + } else { + $line + } + } + if (-not $updated) { + $newLines += "$Key=$Value" + } + Set-Content -Path $Path -Value $newLines +} + +function Remove-EnvValue { + param([string]$Path, [string]$Key) + if (-not (Test-Path $Path)) { return } + $newLines = Get-Content $Path | Where-Object { $_ -notmatch "^$([regex]::Escape($Key))=" } + Set-Content -Path $Path -Value $newLines +} + +function Test-NvidiaGpu { + if (-not (Get-Command nvidia-smi -ErrorAction SilentlyContinue)) { return $false } + Invoke-NativeSafe { nvidia-smi *>$null } | Out-Null + return ($LASTEXITCODE -eq 0) +} + +function Test-NvidiaRuntime { + $info = Invoke-NativeSafe { docker info 2>$null } + if ($info -match 'nvidia') { return $true } + if (Get-Command nvidia-ctk -ErrorAction SilentlyContinue) { return $true } + if (Get-Command nvidia-container-runtime -ErrorAction SilentlyContinue) { return $true } + return $false +} + +function Get-RecommendedVariant { + $driver = (Invoke-NativeSafe { nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>$null } | Select-Object -First 1) + $major = 0 + if ($driver -match '^(\d+)') { + $major = [int]$Matches[1] + } + if ($major -gt 0 -and $major -lt 570) { + return "cuda126" + } + return "cuda" +} + +function Resolve-Variant { + $hasGpu = Test-NvidiaGpu + $hasRuntime = $false + $recommended = "cpu" + + if ($hasGpu) { + $recommended = Get-RecommendedVariant + $hasRuntime = Test-NvidiaRuntime + } + + if ($Variant) { + if ($Variant -eq "cpu") { return "cpu" } + if (-not $hasGpu) { + Write-Warn "No NVIDIA GPU detected; falling back to CPU variant." + return "cpu" + } + if (-not $hasRuntime) { + Write-Warn "NVIDIA GPU detected, but NVIDIA Container Toolkit was not detected; falling back to CPU variant." + Write-Warn "Install the toolkit before enabling SurfSense GPU acceleration." + return "cpu" + } + return $Variant + } + + if ($hasGpu -and -not $hasRuntime) { + Write-Warn "NVIDIA GPU detected, but NVIDIA Container Toolkit was not detected; using CPU variant." + } + + if ($hasGpu -and $hasRuntime -and -not $Quiet -and [Environment]::UserInteractive) { + Write-Host "" + Write-Host "SurfSense detected an NVIDIA GPU." -ForegroundColor Cyan + $choice = Read-Host "Use GPU acceleration? [Y/n]" + switch ($choice) { + "" { return $recommended } + { $_ -match '^(?i)y(es)?$' } { return $recommended } + { $_ -match '^(?i)n(o)?$' } { return "cpu" } + default { + Write-Warn "Unrecognized choice '$choice'; using CPU variant." + return "cpu" + } + } + } + + return "cpu" +} + +function Set-VariantEnv { + param([string]$Path, [string]$SelectedVariant, [bool]$AllowExistingUpdate) + + if ((Test-Path $Path) -and -not $AllowExistingUpdate) { + Write-Warn ".env already exists - keeping your existing configuration." + Write-Info "To change variants later, edit SURFSENSE_VARIANT and COMPOSE_FILE in $Path, then run docker compose up -d --wait." + return + } + + if ($SelectedVariant -eq "cpu") { + Set-EnvValue -Path $Path -Key "SURFSENSE_VARIANT" -Value "" + Remove-EnvValue -Path $Path -Key "COMPOSE_FILE" + Remove-EnvValue -Path $Path -Key "SURFSENSE_GPU_COUNT" + } else { + Set-EnvValue -Path $Path -Key "SURFSENSE_VARIANT" -Value $SelectedVariant + Set-EnvValue -Path $Path -Key "COMPOSE_FILE" -Value "docker-compose.yml;docker-compose.gpu.yml" + if ($GpuCount) { + Set-EnvValue -Path $Path -Key "SURFSENSE_GPU_COUNT" -Value $GpuCount + } + } + + Remove-EnvValue -Path $Path -Key "COMPOSE_PROFILES" +} + +$SelectedVariant = Resolve-Variant + # ── Download files ────────────────────────────────────────────────────────── Write-Step "Downloading SurfSense files" @@ -262,6 +309,7 @@ New-Item -ItemType Directory -Path "$InstallDir\searxng" -Force | Out-Null $Files = @( @{ Src = "docker/docker-compose.yml"; Dest = "docker-compose.yml" } + @{ Src = "docker/docker-compose.gpu.yml"; Dest = "docker-compose.gpu.yml" } @{ Src = "docker/.env.example"; Dest = ".env.example" } @{ Src = "docker/postgresql.conf"; Dest = "postgresql.conf" } @{ Src = "docker/scripts/migrate-database.ps1"; Dest = "scripts/migrate-database.ps1" } @@ -339,9 +387,15 @@ if (-not (Test-Path $envPath)) { $content = $content -replace 'SECRET_KEY=replace_me_with_a_random_string', "SECRET_KEY=$SecretKey" Set-Content -Path $envPath -Value $content -NoNewline + Set-VariantEnv -Path $envPath -SelectedVariant $SelectedVariant -AllowExistingUpdate $false Write-Info "Created $envPath" } else { - Write-Warn ".env already exists - keeping your existing configuration." + if ($PSBoundParameters.ContainsKey('Variant')) { + Set-VariantEnv -Path $envPath -SelectedVariant $SelectedVariant -AllowExistingUpdate $true + Write-Info "Updated SurfSense image variant in existing $envPath" + } else { + Set-VariantEnv -Path $envPath -SelectedVariant $SelectedVariant -AllowExistingUpdate $false + } } # ── Start containers ──────────────────────────────────────────────────────── @@ -405,31 +459,15 @@ if ($MigrationMode) { } Write-Step "Starting all SurfSense services" - Push-Location $InstallDir - Invoke-NativeSafe { docker compose up -d } - Pop-Location - Write-Ok "All containers started; waiting for stack to become healthy..." - - $waitResult = Wait-StackHealthy -TimeoutSec 300 - if (-not $waitResult.Ok) { - Invoke-StackFailureReport -Result $waitResult - } - Write-Ok "All services healthy." + Invoke-ComposeUpWait + Write-Ok "All services started and healthy." Remove-Item $KeyFile -ErrorAction SilentlyContinue } else { Write-Step "Starting SurfSense" - Push-Location $InstallDir - Invoke-NativeSafe { docker compose up -d } - Pop-Location - Write-Ok "All containers started; waiting for stack to become healthy..." - - $waitResult = Wait-StackHealthy -TimeoutSec 300 - if (-not $waitResult.Ok) { - Invoke-StackFailureReport -Result $waitResult - } - Write-Ok "All services healthy." + Invoke-ComposeUpWait + Write-Ok "All services started and healthy." } # ── Watchtower (auto-update) ──────────────────────────────────────────────── @@ -461,7 +499,7 @@ if ($SetupWatchtower) { if ($LASTEXITCODE -eq 0) { Write-Ok "Watchtower started - labeled SurfSense containers will auto-update." } else { - Write-Warn "Could not start Watchtower. You can set it up manually or use: docker compose pull; docker compose up -d" + Write-Warn "Could not start Watchtower. You can set it up manually or use: docker compose pull; docker compose up -d --wait" } } } else { @@ -488,6 +526,9 @@ Y88b d88P Y88b 888 888 888 Y88b d88P Y8b. 888 888 X88 Y8b. $versionDisplay = (Get-Content $envPath | Select-String '^SURFSENSE_VERSION=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1 if (-not $versionDisplay) { $versionDisplay = "latest" } +$variantDisplay = (Get-Content $envPath | Select-String '^SURFSENSE_VARIANT=' | ForEach-Object { ($_ -split '=',2)[1].Trim('"') }) | Select-Object -First 1 +if (-not $variantDisplay) { $variantDisplay = "cpu" } +$wtHours = [math]::Floor($WatchtowerInterval / 3600) Write-Host " OSS Alternative to NotebookLM for Teams [$versionDisplay]" -ForegroundColor Yellow Write-Host ("=" * 62) -ForegroundColor Cyan Write-Host "" @@ -497,13 +538,14 @@ Write-Info " Backend: http://localhost:8929" Write-Info " API Docs: http://localhost:8929/docs" Write-Info "" Write-Info " Config: $InstallDir\.env" +Write-Info " Variant: $variantDisplay" Write-Info " Logs: cd $InstallDir; docker compose logs -f" Write-Info " Stop: cd $InstallDir; docker compose down" -Write-Info " Update: cd $InstallDir; docker compose pull; docker compose up -d" +Write-Info " Update: cd $InstallDir; docker compose pull; docker compose up -d --wait" Write-Info "" if ($SetupWatchtower) { - Write-Info " Watchtower: auto-updates every ${wtHours}h (stop: docker rm -f $WatchtowerContainer)" + Write-Info " Watchtower: auto-updates every ${wtHours}h (disable: docker rm -f $WatchtowerContainer)" } else { Write-Warn " Watchtower skipped. For auto-updates, re-run without -NoWatchtower." } diff --git a/docker/scripts/install.sh b/docker/scripts/install.sh index db81f95eb..8d53519d9 100644 --- a/docker/scripts/install.sh +++ b/docker/scripts/install.sh @@ -8,6 +8,11 @@ # Flags: # --no-watchtower Skip automatic Watchtower setup # --watchtower-interval=SECS Check interval in seconds (default: 86400 = 24h) +# --variant=cpu|cuda|cuda126 Select backend image variant +# --gpu Alias for --variant=cuda +# --cpu Alias for --variant=cpu +# --gpu-count=N|all Number of GPUs to reserve when GPU is enabled +# --quiet Skip interactive prompts # # Handles two cases automatically: # 1. Fresh install — no prior SurfSense data detected @@ -35,12 +40,21 @@ MIGRATION_MODE=false SETUP_WATCHTOWER=true WATCHTOWER_INTERVAL=86400 WATCHTOWER_CONTAINER="watchtower" +REQUESTED_VARIANT="" +VARIANT_EXPLICIT=false +GPU_COUNT="" +QUIET=false # ── Parse flags ───────────────────────────────────────────────────────────── for arg in "$@"; do case "$arg" in --no-watchtower) SETUP_WATCHTOWER=false ;; --watchtower-interval=*) WATCHTOWER_INTERVAL="${arg#*=}" ;; + --variant=*) REQUESTED_VARIANT="${arg#*=}"; VARIANT_EXPLICIT=true ;; + --gpu) REQUESTED_VARIANT="cuda"; VARIANT_EXPLICIT=true ;; + --cpu) REQUESTED_VARIANT="cpu"; VARIANT_EXPLICIT=true ;; + --gpu-count=*) GPU_COUNT="${arg#*=}" ;; + --quiet) QUIET=true ;; esac done @@ -57,6 +71,15 @@ warn() { printf "${YELLOW}[SurfSense]${NC} %s\n" "$1"; } error() { printf "${RED}[SurfSense]${NC} ERROR: %s\n" "$1" >&2; exit 1; } step() { printf "\n${BOLD}${CYAN}── %s${NC}\n" "$1"; } +case "${REQUESTED_VARIANT}" in + ""|cpu|cuda|cuda126) ;; + *) error "Invalid --variant='${REQUESTED_VARIANT}'. Use cpu, cuda, or cuda126." ;; +esac + +if [[ -n "${GPU_COUNT}" && ! "${GPU_COUNT}" =~ ^([0-9]+|all)$ ]]; then + error "Invalid --gpu-count='${GPU_COUNT}'. Use a number or 'all'." +fi + # ── Pre-flight checks ──────────────────────────────────────────────────────── step "Checking prerequisites" @@ -97,126 +120,11 @@ wait_for_pg() { success "PostgreSQL is ready." } -# ── Stack health helpers ───────────────────────────────────────────────────── - -# Enumerate compose services for project `surfsense` as `service|state|health|exitcode` -# lines. Uses `docker inspect` so we don't depend on `jq`, `python3`, or the -# exact ordering of fields in `docker compose ps --format json` output. -get_compose_services() { - local containers - containers=$(docker ps -a --filter "label=com.docker.compose.project=surfsense" --format '{{.Names}}' 2>/dev/null) || true - [[ -z "$containers" ]] && return 0 - - while IFS= read -r container; do - [[ -z "$container" ]] && continue - local svc state health code - svc=$(docker inspect -f '{{index .Config.Labels "com.docker.compose.service"}}' "$container" 2>/dev/null || echo "") - state=$(docker inspect -f '{{.State.Status}}' "$container" 2>/dev/null || echo "unknown") - health=$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{end}}' "$container" 2>/dev/null || echo "") - code=$(docker inspect -f '{{.State.ExitCode}}' "$container" 2>/dev/null || echo "") - [[ -z "$svc" ]] && continue - printf '%s|%s|%s|%s\n' "$svc" "$state" "$health" "$code" - done <<< "$containers" -} - -# Globals populated by wait_stack_healthy / consumed by stack_failure_report. -STACK_BAD=() -STACK_WAITING=() -STACK_GOOD=() -STACK_TIMEOUT=false - -wait_stack_healthy() { - local timeout_sec=${1:-300} - local deadline=$(($(date +%s) + timeout_sec)) - local last_report="" - local bad=() - local waiting=() - local good=() - - while [[ $(date +%s) -lt $deadline ]]; do - local lines - lines=$(get_compose_services) - if [[ -z "$lines" ]]; then - sleep 3 - continue - fi - - bad=() - waiting=() - good=() - - while IFS='|' read -r name state health code; do - [[ -z "$name" ]] && continue - if [[ "$name" == "migrations" ]]; then - if [[ "$state" == "exited" && "$code" == "0" ]]; then - good+=("$name") - elif [[ "$state" == "exited" ]]; then - bad+=("${name} (exit=${code})") - else - waiting+=("${name} (${state})") - fi - continue - fi - - if [[ "$state" == "running" ]]; then - if [[ -z "$health" || "$health" == "healthy" ]]; then - good+=("$name") - elif [[ "$health" == "starting" ]]; then - waiting+=("${name} (starting)") - elif [[ "$health" == "unhealthy" ]]; then - bad+=("${name} (unhealthy)") - else - waiting+=("${name} (${health})") - fi - elif [[ "$state" == "restarting" ]]; then - bad+=("${name} (restarting)") - elif [[ "$state" == "exited" ]]; then - bad+=("${name} (exited, code=${code})") - else - waiting+=("${name} (${state})") - fi - done <<< "$lines" - - if (( ${#bad[@]} > 0 )); then - STACK_BAD=("${bad[@]}") - STACK_WAITING=("${waiting[@]}") - STACK_GOOD=("${good[@]}") - return 1 - fi - if (( ${#waiting[@]} == 0 )); then - STACK_GOOD=("${good[@]}") - return 0 - fi - - local report="Waiting on: ${waiting[*]}" - if [[ "$report" != "$last_report" ]]; then - info "$report" - last_report="$report" - fi - sleep 5 - done - - # bad/waiting/good are declared at function scope so referencing them is - # safe even if the polling loop never executed its body. - STACK_BAD=() - [[ ${#bad[@]} -gt 0 ]] && STACK_BAD=("${bad[@]}") - STACK_WAITING=() - [[ ${#waiting[@]} -gt 0 ]] && STACK_WAITING=("${waiting[@]}") - STACK_GOOD=() - [[ ${#good[@]} -gt 0 ]] && STACK_GOOD=("${good[@]}") - STACK_TIMEOUT=true - return 1 -} +# ── Stack startup helper ───────────────────────────────────────────────────── stack_failure_report() { echo "" echo -e "\033[31m[ERROR]\033[0m Stack did not reach a healthy state." - if (( ${#STACK_BAD[@]} > 0 )) && [[ -n "${STACK_BAD[0]}" ]]; then - echo " Failed: ${STACK_BAD[*]}" - fi - if (( ${#STACK_WAITING[@]} > 0 )) && [[ -n "${STACK_WAITING[0]}" ]]; then - echo " Stuck: ${STACK_WAITING[*]}" - fi echo "" info "Recent logs from migrations / zero-cache / backend:" (cd "${INSTALL_DIR}" && ${DC} logs --tail=60 migrations zero-cache backend 2>&1) || true @@ -224,11 +132,20 @@ stack_failure_report() { echo "Recovery hints:" echo " 1. Inspect migrations: cd ${INSTALL_DIR} && ${DC} logs migrations" echo " 2. Verify publication: cd ${INSTALL_DIR} && ${DC} exec db psql -U surfsense -d surfsense -c 'SELECT pubname FROM pg_publication;'" - echo " 3. Hard reset zero db: cd ${INSTALL_DIR} && ${DC} down && docker volume rm surfsense-zero-cache && ${DC} up -d" + echo " 3. Hard reset zero db: cd ${INSTALL_DIR} && ${DC} down && docker volume rm surfsense-zero-cache && ${DC} up -d --wait" echo "" exit 1 } +compose_up_wait() { + local service="${1:-}" + if [[ -n "$service" ]]; then + (cd "${INSTALL_DIR}" && ${DC} up -d --wait "$service") < /dev/null + else + (cd "${INSTALL_DIR}" && ${DC} up -d --wait) < /dev/null + fi +} + # True if `surfsense-zero-cache` exists but `surfsense-zero-init` does not. # That signals an install that predates the migrations-service fix; the old # replica may be half-initialized and would block zero-cache on next start. @@ -254,6 +171,144 @@ invoke_stale_zero_cache_cleanup() { success "Removed surfsense-zero-cache volume; zero-cache will re-sync on next start." } +# ── Variant and .env helpers ───────────────────────────────────────────────── + +set_env_value() { + local file="$1" + local key="$2" + local value="$3" + local tmp + tmp=$(mktemp) + + if grep -q "^${key}=" "$file" 2>/dev/null; then + awk -v key="$key" -v value="$value" 'BEGIN { prefix = key "=" } $0 ~ "^" prefix { print prefix value; next } { print }' "$file" > "$tmp" + else + cp "$file" "$tmp" + printf '\n%s=%s\n' "$key" "$value" >> "$tmp" + fi + mv "$tmp" "$file" +} + +remove_env_value() { + local file="$1" + local key="$2" + local tmp + tmp=$(mktemp) + awk -v key="$key" 'BEGIN { prefix = key "=" } $0 !~ "^" prefix { print }' "$file" > "$tmp" + mv "$tmp" "$file" +} + +version_major() { + printf '%s' "$1" | cut -d. -f1 +} + +recommend_cuda_variant() { + local driver_version driver_major + driver_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n 1 | tr -d '[:space:]' || true) + driver_major=$(version_major "$driver_version") + + # CUDA 12.8 generally requires an R570+ driver. Use CUDA 12.6 as the + # compatibility fallback for older 12.x driver stacks and GPUs. + if [[ "$driver_major" =~ ^[0-9]+$ && "$driver_major" -lt 570 ]]; then + printf 'cuda126' + else + printf 'cuda' + fi +} + +gpu_runtime_available() { + docker info 2>/dev/null | grep -qi 'nvidia' \ + || command -v nvidia-ctk >/dev/null 2>&1 \ + || command -v nvidia-container-runtime >/dev/null 2>&1 +} + +host_has_nvidia_gpu() { + command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1 +} + +resolve_variant() { + local detected_variant="cpu" + local has_gpu=false + local has_runtime=false + + if host_has_nvidia_gpu; then + has_gpu=true + detected_variant=$(recommend_cuda_variant) + if gpu_runtime_available; then + has_runtime=true + fi + fi + + if $VARIANT_EXPLICIT; then + if [[ "$REQUESTED_VARIANT" == "cpu" ]]; then + printf 'cpu' + return 0 + fi + if ! $has_gpu; then + warn "No NVIDIA GPU detected; falling back to CPU variant." >&2 + printf 'cpu' + return 0 + fi + if ! $has_runtime; then + warn "NVIDIA GPU detected, but NVIDIA Container Toolkit was not detected; falling back to CPU variant." >&2 + warn "Install the toolkit before enabling SurfSense GPU acceleration." >&2 + printf 'cpu' + return 0 + fi + printf '%s' "$REQUESTED_VARIANT" + return 0 + fi + + if $has_gpu && ! $has_runtime; then + warn "NVIDIA GPU detected, but NVIDIA Container Toolkit was not detected; using CPU variant." >&2 + fi + + if $has_gpu && $has_runtime && ! $QUIET && [[ -r /dev/tty && -w /dev/tty ]]; then + local choice + echo "" > /dev/tty + printf "${BOLD}${CYAN}SurfSense detected an NVIDIA GPU.${NC}\n" > /dev/tty + printf "Use GPU acceleration? [Y/n]: " > /dev/tty + read -r choice < /dev/tty || choice="" + case "$choice" in + "") printf '%s' "$detected_variant" ;; + [Yy]|[Yy][Ee][Ss]) printf '%s' "$detected_variant" ;; + [Nn]|[Nn][Oo]) printf 'cpu' ;; + *) warn "Unrecognized choice '${choice}', using CPU variant." >&2; printf 'cpu' ;; + esac + return 0 + fi + + printf 'cpu' +} + +apply_variant_env() { + local env_file="$1" + local variant="$2" + local allow_existing_update="$3" + + if [[ -f "$env_file" && "$allow_existing_update" != "true" ]]; then + warn ".env already exists — keeping your existing configuration." + info "To change variants later, edit SURFSENSE_VARIANT and COMPOSE_FILE in ${env_file}, then run ${DC} up -d --wait." + return 0 + fi + + if [[ "$variant" == "cpu" ]]; then + set_env_value "$env_file" "SURFSENSE_VARIANT" "" + remove_env_value "$env_file" "COMPOSE_FILE" + remove_env_value "$env_file" "SURFSENSE_GPU_COUNT" + else + set_env_value "$env_file" "SURFSENSE_VARIANT" "$variant" + set_env_value "$env_file" "COMPOSE_FILE" "docker-compose.yml:docker-compose.gpu.yml" + if [[ -n "$GPU_COUNT" ]]; then + set_env_value "$env_file" "SURFSENSE_GPU_COUNT" "$GPU_COUNT" + fi + fi + + remove_env_value "$env_file" "COMPOSE_PROFILES" +} + +SELECTED_VARIANT=$(resolve_variant) + # ── Download files ─────────────────────────────────────────────────────────── step "Downloading SurfSense files" @@ -263,6 +318,7 @@ mkdir -p "${INSTALL_DIR}/searxng" FILES=( "docker/docker-compose.yml:docker-compose.yml" + "docker/docker-compose.gpu.yml:docker-compose.gpu.yml" "docker/.env.example:.env.example" "docker/postgresql.conf:postgresql.conf" "docker/scripts/migrate-database.sh:scripts/migrate-database.sh" @@ -336,9 +392,15 @@ if [ ! -f "${INSTALL_DIR}/.env" ]; then else sed -i "s|SECRET_KEY=replace_me_with_a_random_string|SECRET_KEY=${SECRET_KEY}|" "${INSTALL_DIR}/.env" fi + apply_variant_env "${INSTALL_DIR}/.env" "$SELECTED_VARIANT" "false" info "Created ${INSTALL_DIR}/.env" else - warn ".env already exists — keeping your existing configuration." + if $VARIANT_EXPLICIT; then + apply_variant_env "${INSTALL_DIR}/.env" "$SELECTED_VARIANT" "true" + info "Updated SurfSense image variant in existing ${INSTALL_DIR}/.env" + else + apply_variant_env "${INSTALL_DIR}/.env" "$SELECTED_VARIANT" "false" + fi fi # ── Start containers ───────────────────────────────────────────────────────── @@ -401,26 +463,20 @@ if $MIGRATION_MODE; then fi step "Starting all SurfSense services" - (cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null - success "All containers started; waiting for stack to become healthy..." - - if ! wait_stack_healthy 300; then + if ! compose_up_wait; then stack_failure_report fi - success "All services healthy." + success "All services started and healthy." # Key file is no longer needed — SECRET_KEY is now in .env rm -f "${KEY_FILE}" else step "Starting SurfSense" - (cd "${INSTALL_DIR}" && ${DC} up -d) < /dev/null - success "All containers started; waiting for stack to become healthy..." - - if ! wait_stack_healthy 300; then + if ! compose_up_wait; then stack_failure_report fi - success "All services healthy." + success "All services started and healthy." fi # ── Watchtower (auto-update) ───────────────────────────────────────────────── @@ -445,7 +501,7 @@ if $SETUP_WATCHTOWER; then --label-enable \ --interval "${WATCHTOWER_INTERVAL}" >/dev/null 2>&1 < /dev/null \ && success "Watchtower started — labeled SurfSense containers will auto-update." \ - || warn "Could not start Watchtower. You can set it up manually or use: docker compose pull && docker compose up -d" + || warn "Could not start Watchtower. You can set it up manually or use: docker compose pull && docker compose up -d --wait" fi else info "Skipping Watchtower setup (--no-watchtower flag)." @@ -471,6 +527,8 @@ Y88b d88P Y88b 888 888 888 Y88b d88P Y8b. 888 888 X88 Y8b. EOF _version_display=$(grep '^SURFSENSE_VERSION=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true) _version_display="${_version_display:-latest}" +_variant_display=$(grep '^SURFSENSE_VARIANT=' "${INSTALL_DIR}/.env" 2>/dev/null | cut -d= -f2 | tr -d '"' | head -1 || true) +_variant_display="${_variant_display:-cpu}" printf " OSS Alternative to NotebookLM for Teams ${YELLOW}[%s]${NC}\n" "${_version_display}" printf "${CYAN}══════════════════════════════════════════════════════════════${NC}\n\n" @@ -479,13 +537,14 @@ info " Backend: http://localhost:8929" info " API Docs: http://localhost:8929/docs" info "" info " Config: ${INSTALL_DIR}/.env" +info " Variant: ${_variant_display}" info " Logs: cd ${INSTALL_DIR} && ${DC} logs -f" info " Stop: cd ${INSTALL_DIR} && ${DC} down" -info " Update: cd ${INSTALL_DIR} && ${DC} pull && ${DC} up -d" +info " Update: cd ${INSTALL_DIR} && ${DC} pull && ${DC} up -d --wait" info "" if $SETUP_WATCHTOWER; then - info " Watchtower: auto-updates every $((WATCHTOWER_INTERVAL / 3600))h (stop: docker rm -f ${WATCHTOWER_CONTAINER})" + info " Watchtower: auto-updates every $((WATCHTOWER_INTERVAL / 3600))h (disable: docker rm -f ${WATCHTOWER_CONTAINER})" else warn " Watchtower skipped. For auto-updates, re-run without --no-watchtower." fi