diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 9e6adb29..b24a4409 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -49,7 +49,7 @@ RUN --mount=type=bind,source=api/requirements.txt,target=/tmp/req.txt \
 #      sys.prefix/nltk_data, so it travels with the venv on COPY/rsync.
 RUN --mount=type=bind,source=pipecat,target=/tmp/pipecat,rw \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp]' \
+    uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld,smallest]' \
  && uv pip install --group /tmp/pipecat/pyproject.toml:dev \
  && uv pip uninstall opencv-python \
  && uv pip install opencv-python-headless \
diff --git a/api/Dockerfile b/api/Dockerfile
index e3244125..621d1530 100644
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -37,7 +37,7 @@ RUN --mount=type=bind,source=api/requirements.txt,target=/tmp/req.txt \
 #      sys.prefix/nltk_data, so it travels with the venv on COPY.
 RUN --mount=type=bind,source=pipecat,target=/tmp/pipecat,rw \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]' \
+    uv pip install '/tmp/pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld,smallest]' \
  && uv pip uninstall opencv-python \
  && uv pip install opencv-python-headless \
  && python -c "import nltk; nltk.download('punkt_tab', download_dir='/opt/venv/nltk_data', quiet=True)"
diff --git a/api/db/user_client.py b/api/db/user_client.py
index 6455bc2b..27cf7749 100644
--- a/api/db/user_client.py
+++ b/api/db/user_client.py
@@ -4,6 +4,7 @@ from datetime import datetime, timezone
 from loguru import logger
 from pydantic import ValidationError
 from sqlalchemy import func
+from sqlalchemy.dialects.postgresql import insert
 from sqlalchemy.future import select
 
 from api.db.base_client import BaseDBClient
@@ -29,8 +30,6 @@ class UserClient(BaseDBClient):
 
             # Use PostgreSQL's INSERT ... ON CONFLICT DO NOTHING
             # This is atomic and handles race conditions at the database level
-            from sqlalchemy.dialects.postgresql import insert
-
             stmt = insert(UserModel.__table__).values(
                 provider_id=provider_id,
                 created_at=datetime.now(timezone.utc),
@@ -88,21 +87,22 @@ class UserClient(BaseDBClient):
     ) -> dict:
         """Create or update the JSON value stored for a user under `key`."""
         async with self.async_session() as session:
-            row = await self._get_user_configuration_row(session, user_id, key)
-            if row:
-                row.configuration = value
-            else:
-                row = UserConfigurationModel(
-                    user_id=user_id, key=key, configuration=value
-                )
-                session.add(row)
+            stmt = insert(UserConfigurationModel.__table__).values(
+                user_id=user_id,
+                key=key,
+                configuration=value,
+            )
+            stmt = stmt.on_conflict_do_update(
+                constraint="_user_configuration_key_uc",
+                set_={"configuration": stmt.excluded.configuration},
+            ).returning(UserConfigurationModel.configuration)
             try:
+                result = await session.execute(stmt)
                 await session.commit()
             except Exception as e:
                 await session.rollback()
                 raise e
-            await session.refresh(row)
-            return row.configuration
+            return result.scalar_one()
 
     async def get_user_configurations(
         self, user_id: int
diff --git a/api/services/mps_service_key_client.py b/api/services/mps_service_key_client.py
index 87b95fde..f37d2e1c 100644
--- a/api/services/mps_service_key_client.py
+++ b/api/services/mps_service_key_client.py
@@ -512,7 +512,7 @@ class MPSServiceKeyClient:
             if response.status_code == 200:
                 return response.json()
 
-            logger.error(
+            logger.warning(
                 "Failed to authorize MPS workflow run start: "
                 f"{response.status_code} - {response.text}"
             )
diff --git a/api/services/quota_service.py b/api/services/quota_service.py
index 6633736e..9dd8528f 100644
--- a/api/services/quota_service.py
+++ b/api/services/quota_service.py
@@ -37,6 +37,12 @@ BILLING_V2_QUOTA_EXCEEDED_MESSAGE = (
     "or change providers in Models configurations."
 )
 
+SERVICE_TOKEN_ORG_MISMATCH_MESSAGE = (
+    "The Dograh service token being used is created from another account. "
+    "Please create a new service token from the Developers tab and use it in "
+    "your model configuration."
+)
+
 
 @dataclass
 class QuotaCheckResult:
@@ -98,6 +104,26 @@ def _dograh_api_keys(user_config: Any) -> set[str]:
     return api_keys
 
 
+def _is_service_key_org_mismatch_error(error: Exception) -> bool:
+    """Detect a 403 "Service key organization mismatch" error response."""
+    response = getattr(error, "response", None)
+    if getattr(response, "status_code", None) != 403:
+        return False
+
+    marker = "service key organization mismatch"
+    try:
+        payload = response.json()
+    except Exception:
+        # Body is not JSON; fall back to scanning the raw text below.
+        payload = None
+    detail = payload.get("detail") if isinstance(payload, dict) else None
+    if isinstance(detail, str):
+        return detail.lower() == marker
+
+    # Match case-insensitively like the JSON branch; `text` may be missing/None.
+    return marker in str(getattr(response, "text", "") or "").lower()
+
+
 async def _store_run_correlation_id(
     workflow_run_id: int | None,
     correlation_id: str | None,
@@ -173,11 +199,20 @@ async def _authorize_hosted_workflow_run_start(
             },
         )
     except Exception as e:
-        logger.error(
+        logger.warning(
             "Failed to authorize workflow start with MPS for org {}: {}",
             organization_id,
             e,
         )
+        if _is_service_key_org_mismatch_error(e):
+            return (
+                QuotaCheckResult(
+                    has_quota=False,
+                    error_code="service_key_org_mismatch",
+                    error_message=SERVICE_TOKEN_ORG_MISMATCH_MESSAGE,
+                ),
+                True,
+            )
         return (
             QuotaCheckResult(
                 has_quota=False,
diff --git a/api/tests/test_quota_service.py b/api/tests/test_quota_service.py
index 8e2ee6f5..80b5e8c6 100644
--- a/api/tests/test_quota_service.py
+++ b/api/tests/test_quota_service.py
@@ -1,6 +1,7 @@
 from types import SimpleNamespace
 from unittest.mock import AsyncMock
 
+import httpx
 import pytest
 
 from api.services import quota_service
@@ -284,6 +285,69 @@ async def test_authorize_workflow_run_managed_v2_stores_hosted_correlation(
     )
 
 
+@pytest.mark.asyncio
+async def test_authorize_workflow_run_service_token_from_wrong_org_prompts_new_token(
+    monkeypatch,
+):
+    api_key = "mps_sk_12345678"
+    get_config = AsyncMock(
+        return_value=_dograh_config(api_key, managed_service_version=2)
+    )
+    request = httpx.Request(
+        "POST",
+        "http://localhost:8004/api/v1/billing/accounts/42/run-authorization",
+    )
+    response = httpx.Response(
+        403,
+        json={"detail": "Service key organization mismatch"},
+        request=request,
+    )
+    authorize = AsyncMock(
+        side_effect=httpx.HTTPStatusError(
+            "Failed to authorize MPS workflow run start",
+            request=request,
+            response=response,
+        )
+    )
+
+    monkeypatch.setattr(quota_service, "DEPLOYMENT_MODE", "saas")
+    _patch_workflow_context(monkeypatch)
+    monkeypatch.setattr(
+        quota_service,
+        "get_effective_ai_model_configuration_for_workflow",
+        get_config,
+    )
+    monkeypatch.setattr(
+        quota_service.mps_service_key_client,
+        "authorize_workflow_run_start",
+        authorize,
+    )
+    monkeypatch.setattr(
+        quota_service.mps_service_key_client,
+        "check_service_key_usage",
+        AsyncMock(),
+    )
+
+    result = await quota_service.authorize_workflow_run_start(
+        workflow_id=7,
+        workflow_run_id=88,
+    )
+
+    assert result.has_quota is False
+    assert result.error_code == "service_key_org_mismatch"
+    assert result.error_message == quota_service.SERVICE_TOKEN_ORG_MISMATCH_MESSAGE
+    assert "new service token from the Developers tab" in result.error_message
+    authorize.assert_awaited_once_with(
+        organization_id=42,
+        workflow_run_id=88,
+        service_key=api_key,
+        require_correlation_id=True,
+        minimum_credits=quota_service.MINIMUM_DOGRAH_CREDITS_FOR_CALL,
+        created_by="provider-123",
+        metadata={"dograh_user_id": "123", "workflow_id": 7},
+    )
+
+
 @pytest.mark.asyncio
 async def test_authorize_workflow_run_oss_uses_key_paths_not_workflow_org(
     monkeypatch,
diff --git a/api/tests/test_user_configuration_upsert.py b/api/tests/test_user_configuration_upsert.py
new file mode 100644
index 00000000..2861354b
--- /dev/null
+++ b/api/tests/test_user_configuration_upsert.py
@@ -0,0 +1,98 @@
+import pytest
+from sqlalchemy import select
+from sqlalchemy.dialects import postgresql
+
+from api.db.models import UserConfigurationModel
+from api.db.user_client import UserClient
+from api.enums import UserConfigurationKey
+
+
+class _FakeResult:
+    def __init__(self, value: dict):
+        self._value = value
+
+    def scalar_one(self) -> dict:
+        return self._value
+
+
+class _FakeSession:
+    def __init__(self, result_value: dict):
+        self.result_value = result_value
+        self.statements = []
+        self.committed = False
+        self.rolled_back = False
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        return False
+
+    async def execute(self, stmt):
+        self.statements.append(stmt)
+        return _FakeResult(self.result_value)
+
+    async def commit(self):
+        self.committed = True
+
+    async def rollback(self):
+        self.rolled_back = True
+
+
+@pytest.mark.asyncio
+async def test_upsert_user_configuration_value_uses_atomic_conflict_update():
+    result_value = {"completed_actions": ["web_call_started"]}
+    fake_session = _FakeSession(result_value)
+    client = UserClient.__new__(UserClient)
+    client.async_session = lambda: fake_session
+
+    value = await client.upsert_user_configuration_value(
+        86,
+        UserConfigurationKey.ONBOARDING.value,
+        result_value,
+    )
+
+    assert value == result_value
+    assert fake_session.committed is True
+    assert fake_session.rolled_back is False
+    assert len(fake_session.statements) == 1
+
+    compiled = str(fake_session.statements[0].compile(dialect=postgresql.dialect()))
+    assert "ON CONFLICT ON CONSTRAINT _user_configuration_key_uc DO UPDATE" in compiled
+    assert "configuration = excluded.configuration" in compiled
+    assert "last_validated_at" not in compiled
+
+
+@pytest.mark.asyncio
+async def test_upsert_user_configuration_value_updates_existing_row(
+    db_session,
+    async_session,
+):
+    user, _ = await db_session.get_or_create_user_by_provider_id(
+        "user-config-upsert-test"
+    )
+
+    first = await db_session.upsert_user_configuration_value(
+        user.id,
+        UserConfigurationKey.ONBOARDING.value,
+        {"skipped": False},
+    )
+    second = await db_session.upsert_user_configuration_value(
+        user.id,
+        UserConfigurationKey.ONBOARDING.value,
+        {"skipped": True},
+    )
+
+    assert first == {"skipped": False}
+    assert second == {"skipped": True}
+
+    result = await async_session.execute(
+        select(UserConfigurationModel).where(
+            UserConfigurationModel.user_id == user.id,
+            UserConfigurationModel.key == UserConfigurationKey.ONBOARDING.value,
+        )
+    )
+    rows = result.scalars().all()
+
+    assert len(rows) == 1
+    assert rows[0].configuration == {"skipped": True}
diff --git a/docs/configurations/api-keys.mdx b/docs/configurations/api-keys.mdx
index 7471c7ba..924abf60 100644
--- a/docs/configurations/api-keys.mdx
+++ b/docs/configurations/api-keys.mdx
@@ -15,5 +15,12 @@ Please note that you must copy and keep the API key secretly, since this is the
 ### Service Keys
 Service Keys are the keys which you generate to be used in [Model Configurations](inference-providers). In order to generate that, you can go to `/api-keys` and create a new key. 
 
+<Note>
+You can use a Service Key created in Dograh Cloud (`https://app.dograh.com/api-keys`) in your self-hosted Dograh deployment. Create the Service Key from your Dograh Cloud account, then paste it into **Model Configurations** in your self-hosted instance to use Dograh-managed inference providers. You can purchase Dograh credits in Dograh Cloud; billing happens on the Cloud account that owns the Service Key.
+</Note>
+
 ![Create a new Service Key](../images/service-keys.png)
 
+<Warning>
+Service Keys are scoped to the Dograh Cloud account that created them. You cannot use a Service Key from one cloud-hosted account in another cloud-hosted account; create a new Service Key from the account where you want to use it.
+</Warning>
diff --git a/docs/configurations/inference-providers.mdx b/docs/configurations/inference-providers.mdx
index d31dbc81..9fb12d16 100644
--- a/docs/configurations/inference-providers.mdx
+++ b/docs/configurations/inference-providers.mdx
@@ -1,187 +1,93 @@
 ---
 title: "Model Configurations"
-description: "Voice Agents need AI Models to work, like LLM (Large Language Model), TTS (Voice) and STT (Transcriber). You can use any of your faviourite providers with Dograh Platform to run your Voice Agent."
+description: "Configure the speech-to-speech, Dograh-managed, or bring-your-own-key models your Dograh agents use."
 ---
 
-## How Model Configuration Works
+## How model configurations work
 
-Dograh uses a **two-level configuration system** for AI models:
+Model Configurations define the default AI model setup for your organization. Agents use this configuration unless you set agent-level model overrides in the agent settings.
 
-1. **Global configuration** — A single set of model settings (LLM, TTS, STT) that applies to **all agents** by default.
-2. **Agent-level overrides** — Optional per-agent settings that override the global configuration for specific services.
+To configure models, open **Models** in your Dograh dashboard:
 
-If no overrides are set for an agent, it uses the global configuration as-is.
-
-<Note>
-Agent-level overrides are **selective** — you can override only the services you want to change. For example, you can override just the LLM provider for a specific agent while keeping the global TTS and STT settings. There is no need to reconfigure every service.
-</Note>
-
-## Global Configuration
-
-The global configuration is the default model setup shared across all your agents. Dograh ships with its own models by default — when you sign up on https://app.dograh.com or set up the platform on your self-hosted infrastructure, you get some Dograh model credits to start with.
-
-To configure the global models, go to **Model Configurations** in your dashboard:
 - **Hosted:** `https://app.dograh.com/model-configurations`
 - **Self-hosted:** `http://localhost:3010/model-configurations`
+- **Local development:** `http://localhost:3000/model-configurations`
 
-![Model Configuration](../images/service-configuration.png)
+The Models page has three top-level sections:
 
-From here you can configure each service:
-
-| Service | What it does |
-|---------|-------------|
-| **LLM** | The language model that generates responses (e.g., OpenAI GPT-4.1, Anthropic Claude) |
-| **TTS (Voice)** | The text-to-speech model that converts responses to spoken audio (e.g., ElevenLabs, Cartesia) |
-| **STT (Transcriber)** | The speech-to-text model that transcribes user speech (e.g., Deepgram, AssemblyAI) |
-| **Realtime** | A single speech-to-speech model that handles LLM, TTS, and STT in one (e.g., Gemini Live) |
-
-Select a provider from the dropdown and configure the API key, model, and any provider-specific settings. For Dograh's own models, see [Service Keys](api-keys) for instructions on creating Service Keys.
-
-## Agent-Level Model Overrides
-
-You can override the global model configuration for any individual agent. This is useful when different agents have different requirements — for example, a customer support agent might use a faster, cheaper LLM while a sales agent uses a more capable one.
-
-### Configuring overrides
-
-1. Open the agent you want to customize.
-2. Go to **Settings** in the agent detail page.
-3. Select the **Model Overrides** tab.
-4. You will see tabs for each service: **LLM**, **Voice** (TTS), and **Transcriber** (STT).
-5. Toggle **Override** on for the service you want to change.
-6. Configure the provider, model, and other settings as needed.
-7. Save your changes.
-
-### Selective overrides
-
-Each service can be toggled independently. When an override is **off** for a service, the agent inherits the global setting for that service. When an override is **on**, the agent uses the override setting instead.
-
-| LLM Override | TTS Override | STT Override | Result |
-|---|---|---|---|
-| Off | Off | Off | Agent uses global config for all services |
-| On | Off | Off | Agent uses custom LLM, global TTS and STT |
-| Off | On | Off | Agent uses global LLM and STT, custom TTS |
-| On | On | On | Agent uses custom config for all services |
-
-For example, if you only want to change the voice for a specific agent:
-1. Leave the LLM and Transcriber overrides **off**.
-2. Toggle the Voice override **on**.
-3. Select a different TTS provider or voice.
-4. The agent will use your custom voice while still using the global LLM and STT.
-
-### Realtime mode override
-
-You can also switch an individual agent to use a **Realtime** provider (such as Gemini Live) even if the global configuration uses standard LLM + TTS + STT. Toggle the **Realtime** switch in the Model Overrides tab, then configure the realtime provider, model, and voice.
+| Section | When to use it |
+|---------|----------------|
+| **Speech to Speech** | Use a realtime speech-to-speech model for the live conversation. You still configure an LLM alongside it for variable extraction and QA. |
+| **Dograh** | Use Dograh-managed LLM, voice, and transcriber models behind one Dograh Service Key. |
+| **BYOK** | Bring your own provider keys and configure LLM, Voice, Transcriber, and Embedding models separately. |
 
 <Note>
-When an agent uses a Realtime provider, it replaces the separate TTS and STT services with a single speech-to-speech model. An **LLM** is still required alongside the Realtime model — it's used for out-of-band tasks like variable extraction and QA analysis, which the realtime service does not handle. Context compaction is not applicable in Realtime mode and is ignored if enabled.
+Model settings are organization-scoped. If no agent-level override is set, every agent in the organization uses the saved organization-level configuration.
 </Note>
 
-## Gemini 3.1 Live
+## Speech to Speech
 
-Gemini 3.1 Live is Google's realtime multimodal API that handles both LLM and voice in a single model. Instead of configuring separate LLM, TTS, and STT services, Gemini Live acts as an all-in-one realtime provider — it processes speech input, generates a response, and speaks it back, all over a single streaming connection.
+Use **Speech to Speech** when you want a realtime model to handle the live spoken conversation directly. In this mode, the realtime model handles speech input and speech output, so you do not configure separate Voice and Transcriber services.
 
-Dograh supports Gemini 3.1 Live as a **Realtime** provider. The default model is `gemini-3.1-flash-live-preview`.
+![Speech to Speech model configuration](../images/model-configuration-speech-to-speech.png)
 
-### Available Voices
+The Speech to Speech section has nested tabs:
 
-You can choose from the following built-in voices:
+| Tab | What to configure |
+|-----|-------------------|
+| **Realtime Model** | The speech-to-speech provider, model, voice, and API key. |
+| **LLM** | A standard LLM used for non-realtime tasks such as variable extraction and QA analysis. |
+| **Embedding** | An embedding model used by features that need embeddings, such as retrieval from knowledge base content. |
 
-| Voice | Description |
-|-------|-------------|
-| Puck | Default voice |
-| Charon | — |
-| Kore | — |
-| Fenrir | — |
-| Aoede | — |
+<Warning>
+An LLM is still required when you use Speech to Speech. The realtime model handles the live voice conversation, but Dograh uses the LLM for analysis tasks that happen outside the live audio stream.
+</Warning>
 
-### Getting a Gemini API Key
+## Dograh
 
-To use Gemini 3.1 Live with Dograh, you need a Google Gemini API key. Follow these steps:
+Use **Dograh** when you want Dograh to manage the model providers for you. This path uses one Dograh Service Key for Dograh-managed models instead of separate provider keys for LLM, Voice, and Transcriber.
 
-1. Go to [Google AI Studio](https://aistudio.google.com/).
-2. Sign in with your Google account.
-3. Click on **Get API Key** in the left sidebar.
-4. Click **Create API Key**.
-5. Select an existing Google Cloud project or create a new one.
-6. Copy the generated API key and store it securely.
+![Dograh model configuration](../images/model-configuration-dograh.png)
 
-<Note>
-  The Gemini API key is different from a Google Cloud service account key. You specifically need a **Gemini API key** from Google AI Studio for use with Dograh.
-</Note>
+Configure:
 
-### Configuring Gemini 3.1 Live in Dograh
+| Field | What it controls |
+|-------|------------------|
+| **Voice** | The Dograh-managed voice to use. |
+| **Speed** | The voice playback speed. |
+| **Language** | The language behavior, including multilingual auto-detect when available. |
+| **API Key** | Your Dograh Service Key. Create Service Keys from **Developers**. |
 
-1. Go to **Model Configurations** in your Dograh dashboard (`https://app.dograh.com/model-configurations` for hosted or `http://localhost:3010/model-configurations` for local).
-2. Under the **Realtime** section, select `google_realtime` as the provider.
-3. Paste your Gemini API key.
-4. Select the model (`gemini-3.1-flash-live-preview` is available by default, or you can enter a model name manually).
-5. Choose a voice from the dropdown (default is `Puck`).
-6. Select the language (currently `en` is supported).
+For details on creating and using Service Keys, see [API Keys and Service Keys](api-keys#service-keys).
 
-<Note>
-  When using a Realtime provider like Gemini Live, you do not need to configure separate TTS and STT services — the realtime model handles speech in and out. However, you **must** still configure an **LLM** under the LLM tab: it powers variable extraction and QA analysis, which the realtime service does not perform.
-</Note>
+## BYOK
 
-## Gemini Live on Vertex AI
+Use **BYOK** when you want to bring your own provider accounts and API keys. This gives you separate control over each model category.
 
-If you want to run Gemini Live through your own Google Cloud project — for billing consolidation, VPC controls, regional residency, or enterprise IAM — Dograh also supports Gemini Live via **Vertex AI** as a separate provider (`google_vertex_realtime`). The default model is `google/gemini-live-2.5-flash-native-audio`.
+![BYOK model configuration](../images/model-configuration-byok.png)
 
-Unlike Google AI Studio (which uses a single Gemini API key), Vertex AI authenticates with a **service account** belonging to your Google Cloud project.
+The BYOK section has nested tabs:
 
-### Prerequisites
+| Tab | What to configure |
+|-----|-------------------|
+| **LLM** | The chat or reasoning model provider, model, optional base URL, and API key. |
+| **Voice** | The text-to-speech provider, voice, model, speed, optional base URL, and API key. |
+| **Transcriber** | The speech-to-text provider, model, language, and API key. |
+| **Embedding** | The embedding provider, model, and API key. |
 
-1. A Google Cloud project with billing enabled.
-2. The Vertex AI API enabled on that project:
+Provider-specific fields appear only when they apply. For example, OpenAI-compatible LLM providers can expose a **Base URL** field, ElevenLabs voices can expose a voice ID, and transcribers can expose language options.
 
-   ```bash
-   gcloud services enable aiplatform.googleapis.com --project=YOUR_PROJECT_ID
-   ```
+## Agent-level model overrides
 
-3. A service account with the **Vertex AI User** role (`roles/aiplatform.user`) on the project:
+You can override the organization model configuration for an individual agent. This is useful when different agents need different models, voices, transcribers, or providers.
 
-   ```bash
-   gcloud projects add-iam-policy-binding YOUR_PROJECT_ID \
-     --member="serviceAccount:YOUR_SA@YOUR_PROJECT_ID.iam.gserviceaccount.com" \
-     --role="roles/aiplatform.user"
-   ```
+To configure an override:
 
-4. A **JSON** key for that service account (P12 keys are not supported).
+1. Open the agent.
+2. Go to **Settings**.
+3. Open **Model Overrides**.
+4. Enable the override for the service you want to customize.
+5. Configure the provider, model, and keys for that service.
+6. Save the agent settings.
 
-### Creating the service account key
-
-1. In the GCP Console, go to **IAM & Admin → Service Accounts**.
-2. Pick an existing service account (or create a new one).
-3. Open the **Keys** tab → **Add Key → Create new key**.
-4. Choose **JSON** as the key type and click **Create**.
-5. The key file will download to your computer — store it securely and treat it as a secret.
-
-<Note>
-  Always pick **JSON**, not P12. The Vertex AI client libraries used by Dograh only accept service-account JSON keys; P12 is a legacy format retained for older Google Workspace integrations.
-</Note>
-
-### Configuring Vertex AI Realtime in Dograh
-
-1. Go to **Model Configurations** in your Dograh dashboard.
-2. Enable the **Realtime** toggle.
-3. Under the **Realtime** section, select `google_vertex_realtime` as the provider.
-4. Fill in the fields:
-
-   | Field | What to put in |
-   |---|---|
-   | **Model** | Vertex publisher/model id, e.g. `google/gemini-live-2.5-flash-native-audio` |
-   | **Voice** | One of the built-in voices (Puck, Charon, Kore, Fenrir, Aoede) |
-   | **Language** | BCP-47 code (e.g. `en-US`) |
-   | **Project Id** | The `project_id` value from your service-account JSON |
-   | **Location** | GCP region where the model is available (e.g. `us-east4`) |
-   | **Credentials** | Paste the **entire contents** of the service-account JSON file |
-   | **API Key** | Leave blank — Vertex AI does not use API keys |
-
-5. Save the configuration.
-
-<Note>
-  Paste the whole JSON file into the **Credentials** field — including `private_key`, `client_email`, and all other entries. Don't try to extract individual fields. If `Credentials` is left blank, Dograh falls back to **Application Default Credentials (ADC)** from the host environment, which is useful when running Dograh on a GCP VM or GKE pod with an attached service account.
-</Note>
-
-<Note>
-  IAM changes can take up to ~60 seconds to propagate. If you see `Permission 'aiplatform.endpoints.predict' denied`, wait a minute and retry — or double-check that the role was granted to the same service account whose JSON you pasted.
-</Note>
\ No newline at end of file
+Agent-level overrides are selective. For example, you can override only the Voice service for one agent while it continues to use the organization-level LLM and Transcriber configuration.
diff --git a/docs/images/model-configuration-byok.png b/docs/images/model-configuration-byok.png
new file mode 100644
index 00000000..0413d718
Binary files /dev/null and b/docs/images/model-configuration-byok.png differ
diff --git a/docs/images/model-configuration-dograh.png b/docs/images/model-configuration-dograh.png
new file mode 100644
index 00000000..7f4d0a9b
Binary files /dev/null and b/docs/images/model-configuration-dograh.png differ
diff --git a/docs/images/model-configuration-speech-to-speech.png b/docs/images/model-configuration-speech-to-speech.png
new file mode 100644
index 00000000..52b5d99b
Binary files /dev/null and b/docs/images/model-configuration-speech-to-speech.png differ
diff --git a/docs/images/service-configuration.png b/docs/images/service-configuration.png
deleted file mode 100644
index 2cbc3506..00000000
Binary files a/docs/images/service-configuration.png and /dev/null differ
diff --git a/scripts/setup_pipecat.sh b/scripts/setup_pipecat.sh
index 6da6844a..101d4a90 100755
--- a/scripts/setup_pipecat.sh
+++ b/scripts/setup_pipecat.sh
@@ -20,6 +20,6 @@ pip install -r api/requirements.txt
 
 # Install pipecat from submodule last so it overrides any pipecat-ai pulled in by dependencies
 echo "Installing pipecat dependencies..."
-pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,inworld]
+pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld,smallest]
 
 echo "Setup complete! Pipecat is now available as a git submodule."
diff --git a/scripts/setup_requirements.ps1 b/scripts/setup_requirements.ps1
index 21f53592..b2106d8a 100644
--- a/scripts/setup_requirements.ps1
+++ b/scripts/setup_requirements.ps1
@@ -60,7 +60,7 @@ if ($Dev) {
 
 # Install pipecat in editable mode with all extras
 Write-Host "Installing pipecat dependencies..."
-uv pip install -e './pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb]'
+uv pip install -e './pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld,smallest]'
 
 if ($Dev) {
     Write-Host "Installing pipecat dev dependencies..."
diff --git a/scripts/setup_requirements.sh b/scripts/setup_requirements.sh
index 8074661f..9a328c70 100755
--- a/scripts/setup_requirements.sh
+++ b/scripts/setup_requirements.sh
@@ -80,7 +80,7 @@ fi
 
 # Install pipecat in editable mode with all extras
 echo "Installing pipecat dependencies..."
-uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld]
+uv pip install -e ./pipecat[cartesia,deepgram,openai,elevenlabs,groq,google,azure,sarvam,soundfile,silero,webrtc,speechmatics,openrouter,camb,mcp,inworld,smallest]
 
 if [ "$DEV_MODE" -eq 1 ]; then
     echo "Installing pipecat dev dependencies..."
diff --git a/sdk/python/src/dograh_sdk/_generated_models.py b/sdk/python/src/dograh_sdk/_generated_models.py
index 2932698e..788b0634 100644
--- a/sdk/python/src/dograh_sdk/_generated_models.py
+++ b/sdk/python/src/dograh_sdk/_generated_models.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
-#   filename:  dograh-openapi-XXXXXX.json.ZaaB4gFvgD
-#   timestamp: 2026-06-19T10:19:05+00:00
+#   filename:  dograh-openapi-XXXXXX.json.6F33jkClt9
+#   timestamp: 2026-06-19T12:41:10+00:00
 
 from __future__ import annotations
 
diff --git a/ui/src/app/workflow/[workflowId]/components/workflow-tester/EmbeddedVoiceTester.tsx b/ui/src/app/workflow/[workflowId]/components/workflow-tester/EmbeddedVoiceTester.tsx
index 9a0ff85d..e821971d 100644
--- a/ui/src/app/workflow/[workflowId]/components/workflow-tester/EmbeddedVoiceTester.tsx
+++ b/ui/src/app/workflow/[workflowId]/components/workflow-tester/EmbeddedVoiceTester.tsx
@@ -148,6 +148,7 @@ export function EmbeddedVoiceTester({
                 error={apiKeyError}
                 errorCode={apiKeyErrorCode}
                 onNavigateToBilling={() => router.push("/billing")}
+                onNavigateToDevelopers={() => router.push("/api-keys")}
                 onNavigateToModelConfig={() => router.push("/model-configurations")}
             />
 
diff --git a/ui/src/app/workflow/[workflowId]/run/[runId]/components/ApiKeyErrorDialog.tsx b/ui/src/app/workflow/[workflowId]/run/[runId]/components/ApiKeyErrorDialog.tsx
index 29672545..fe80d1eb 100644
--- a/ui/src/app/workflow/[workflowId]/run/[runId]/components/ApiKeyErrorDialog.tsx
+++ b/ui/src/app/workflow/[workflowId]/run/[runId]/components/ApiKeyErrorDialog.tsx
@@ -1,14 +1,17 @@
-import { AlertCircle, CreditCard, Key } from "lucide-react";
+import { AlertCircle, CreditCard, ExternalLink, Key } from "lucide-react";
 
 import { Button } from "@/components/ui/button";
 import { Dialog, DialogContent, DialogDescription, DialogFooter, DialogHeader, DialogTitle } from "@/components/ui/dialog";
 
+const SERVICE_KEYS_DOCS_URL = "https://docs.dograh.com/configurations/api-keys#service-keys";
+
 interface ApiKeyErrorDialogProps {
     open: boolean;
     onOpenChange: (open: boolean) => void;
     error: string | null;
     errorCode: string | null;
     onNavigateToBilling: () => void;
+    onNavigateToDevelopers: () => void;
     onNavigateToModelConfig: () => void;
 }
 
@@ -18,15 +21,29 @@ export const ApiKeyErrorDialog = ({
     error,
     errorCode,
     onNavigateToBilling,
+    onNavigateToDevelopers,
     onNavigateToModelConfig,
 }: ApiKeyErrorDialogProps) => {
     const isBillingCreditsError = errorCode === 'insufficient_credits';
+    const isServiceKeyOrgMismatch = errorCode === 'service_key_org_mismatch';
     const isQuotaError = isBillingCreditsError || errorCode === 'quota_exceeded';
 
-    const title = isQuotaError ? "Insufficient Credits" : "API Configuration Error";
+    const title = isQuotaError
+        ? "Insufficient Credits"
+        : isServiceKeyOrgMismatch
+            ? "Service Token Account Mismatch"
+            : "API Configuration Error";
     const icon = isQuotaError ? <CreditCard className="h-5 w-5 text-orange-500" /> : <Key className="h-5 w-5 text-red-500" />;
-    const buttonText = isBillingCreditsError ? "Go to Billing" : "Go to Model Configurations";
-    const onNavigate = isBillingCreditsError ? onNavigateToBilling : onNavigateToModelConfig;
+    const buttonText = isBillingCreditsError
+        ? "Go to Billing"
+        : isServiceKeyOrgMismatch
+            ? "Go to Developers"
+            : "Go to Model Configurations";
+    const onNavigate = isBillingCreditsError
+        ? onNavigateToBilling
+        : isServiceKeyOrgMismatch
+            ? onNavigateToDevelopers
+            : onNavigateToModelConfig;
 
     return (
         <Dialog open={open} onOpenChange={onOpenChange}>
@@ -46,6 +63,16 @@ export const ApiKeyErrorDialog = ({
                                         Purchase credits from Billing to continue using Dograh-managed models.
                                     </p>
                                 )}
+                                {isServiceKeyOrgMismatch && (
+                                    <a
+                                        href={SERVICE_KEYS_DOCS_URL}
+                                        target="_blank"
+                                        rel="noopener noreferrer"
+                                        className="inline-flex items-center gap-0.5 text-muted-foreground underline"
+                                    >
+                                        Learn more <ExternalLink className="h-3 w-3" />
+                                    </a>
+                                )}
                             </div>
                         </div>
                     </DialogDescription>
diff --git a/ui/src/app/workflow/[workflowId]/run/[runId]/hooks/useWebSocketRTC.tsx b/ui/src/app/workflow/[workflowId]/run/[runId]/hooks/useWebSocketRTC.tsx
index b8b19182..2ad08dc2 100644
--- a/ui/src/app/workflow/[workflowId]/run/[runId]/hooks/useWebSocketRTC.tsx
+++ b/ui/src/app/workflow/[workflowId]/run/[runId]/hooks/useWebSocketRTC.tsx
@@ -33,6 +33,7 @@ const HANDLED_SERVICE_ERROR_TYPES = new Set([
     'quota_exceeded',
     'insufficient_credits',
     'invalid_service_key',
+    'service_key_org_mismatch',
     'quota_check_failed',
 ]);
 
diff --git a/ui/src/app/workflow/[workflowId]/run/[runId]/page.tsx b/ui/src/app/workflow/[workflowId]/run/[runId]/page.tsx
index 3362e1ab..4cf8dbc1 100644
--- a/ui/src/app/workflow/[workflowId]/run/[runId]/page.tsx
+++ b/ui/src/app/workflow/[workflowId]/run/[runId]/page.tsx
@@ -408,10 +408,6 @@ function RunMetricsSection({
             </CardHeader>
             <CardContent className="grid gap-3 sm:grid-cols-2 xl:grid-cols-3">
                 <MetricCard label="Duration" value={formatDuration(costInfo?.call_duration_seconds)} />
-                <MetricCard
-                    label="Token Usage"
-                    value={costInfo?.dograh_token_usage != null ? costInfo.dograh_token_usage.toLocaleString() : 'N/A'}
-                />
                 <MetricCard label="User Turns" value={String(metrics.userTurns)} />
                 <MetricCard label="Bot Turns" value={String(metrics.botTurns)} />
                 <MetricCard label="Tool Calls" value={String(metrics.toolCalls)} />