fix(ari): pre-register ext channel id and defer bridge to its StasisS… (#284)

* fix(ari): pre-register ext channel id and defer bridge to its StasisStart

Two race conditions in the inbound ARI flow could leave a call silent:

1. Bridging both channels immediately after creating the ext media leg
   raced against the ext channel entering the Stasis application; slow
   chan_websocket handshakes produced "Channel not in Stasis application"
   422 errors on addChannel.

2. Asterisk could fire StasisStart for the ext channel before the
   externalMedia POST response returned, so _is_ext_channel returned
   False and the event was dropped as an unknown outbound call.

Fixes:
- Generate the ext channel id as dograh-ext-<uuid> client-side and pass
  it to Asterisk via the channelId query param. Mark the ext channel,
  set its channel->run mapping, register the pending bridge entry, and
  persist gathered_context.ext_channel_id all before the POST.
- Defer the bridge to a new _complete_bridge_after_ext_ready handler
  triggered by the ext channel's own StasisStart. Both channels are
  guaranteed in Stasis by then, so addChannel cannot 422.
- On POST failure or channelId mismatch, roll back the pending entry
  and ERROR loudly.

* fix: replace in-memory dict with redis storage
This commit is contained in:
Sabiha Khan 2026-05-13 18:33:34 +05:30 committed by GitHub
parent 59619e9eaa
commit ebeffdbc40
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -14,6 +14,7 @@ setup_logging()
import asyncio import asyncio
import json import json
import signal import signal
import uuid
from typing import Dict, Optional, Set from typing import Dict, Optional, Set
from urllib.parse import urlparse from urllib.parse import urlparse
@ -35,7 +36,9 @@ from api.services.telephony.transfer_event_protocol import (
# Redis key pattern and TTL for channel-to-run mapping # Redis key pattern and TTL for channel-to-run mapping
_CHANNEL_KEY_PREFIX = "ari:channel:" _CHANNEL_KEY_PREFIX = "ari:channel:"
_EXT_CHANNEL_KEY_PREFIX = "ari:ext_channel:" _EXT_CHANNEL_KEY_PREFIX = "ari:ext_channel:"
_PENDING_BRIDGE_PREFIX = "ari:pending_bridge:"
_CHANNEL_KEY_TTL = 3600 # 1 hour safety expiry _CHANNEL_KEY_TTL = 3600 # 1 hour safety expiry
_PENDING_BRIDGE_TTL = 300 # 5 min safety expiry for bridge-pending state
class ARIConnection: class ARIConnection:
@ -121,6 +124,33 @@ class ARIConnection:
r = await self._get_redis() r = await self._get_redis()
await r.delete(f"{_EXT_CHANNEL_KEY_PREFIX}{channel_id}") await r.delete(f"{_EXT_CHANNEL_KEY_PREFIX}{channel_id}")
async def _set_pending_bridge(
self,
ext_channel_id: str,
caller_channel_id: str,
workflow_run_id: str,
):
"""Store the bridge context to be consumed when ext media enters Stasis."""
r = await self._get_redis()
await r.set(
f"{_PENDING_BRIDGE_PREFIX}{ext_channel_id}",
json.dumps(
{
"caller_channel_id": caller_channel_id,
"workflow_run_id": workflow_run_id,
}
),
ex=_PENDING_BRIDGE_TTL,
)
async def _pop_pending_bridge(self, ext_channel_id: str) -> Optional[dict]:
"""Read and delete the pending bridge context. Returns None if absent."""
r = await self._get_redis()
val = await r.getdel(f"{_PENDING_BRIDGE_PREFIX}{ext_channel_id}")
if val is None:
return None
return json.loads(val)
@property @property
def ws_url(self) -> str: def ws_url(self) -> str:
"""Build the ARI WebSocket URL.""" """Build the ARI WebSocket URL."""
@ -249,12 +279,25 @@ class ARIConnection:
) )
if event_type == "StasisStart": if event_type == "StasisStart":
# Skip external media channels we created — they fire
# their own StasisStart but need no further handling.
if await self._is_ext_channel(channel_id): if await self._is_ext_channel(channel_id):
logger.debug( # External media channel has entered Stasis. If there is a
f"[ARI org={self.organization_id}] StasisStart for our " # queued bridge for it, finish bridging now; otherwise the
f"externalMedia channel {channel_id}, ignoring" # caller-side handler did not register one and this event is
# nothing for us to act on.
pending = await self._pop_pending_bridge(channel_id)
if pending is None:
logger.debug(
f"[ARI org={self.organization_id}] StasisStart for ext "
f"channel {channel_id} with no pending bridge"
)
return
logger.info(
f"[ARI org={self.organization_id}] Ext channel {channel_id} "
f"entered Stasis — completing bridge for caller "
f"{pending['caller_channel_id']} (run {pending['workflow_run_id']})"
)
asyncio.create_task(
self._complete_bridge_after_ext_ready(channel_id, pending)
) )
return return
@ -394,12 +437,18 @@ class ARIConnection:
workflow_id: str, workflow_id: str,
user_id: str, user_id: str,
workflow_run_id: str, workflow_run_id: str,
channel_id: Optional[str] = None,
) -> str: ) -> str:
"""Create an external media channel via chan_websocket. """Create an external media channel via chan_websocket.
Uses ARI externalMedia with transport=websocket so Asterisk connects Uses ARI externalMedia with transport=websocket so Asterisk connects
to our backend over WebSocket (via websocket_client.conf). to our backend over WebSocket (via websocket_client.conf).
Dynamic routing params are passed as URI query params via v() in transport_data. Dynamic routing params are passed as URI query params via v() in transport_data.
If ``channel_id`` is provided, it is passed to Asterisk as the
``channelId`` query parameter so the new channel is created with
that id. The caller can then register ext-channel state ahead of
the POST and avoid racing against the StasisStart event.
""" """
# v() appends URI query params to the websocket_client.conf URL # v() appends URI query params to the websocket_client.conf URL
# e.g. wss://api.dograh.com/ws/ari?workflow_id=1&user_id=2&workflow_run_id=3 # e.g. wss://api.dograh.com/ws/ari?workflow_id=1&user_id=2&workflow_run_id=3
@ -409,22 +458,25 @@ class ARIConnection:
f"workflow_run_id={workflow_run_id})" f"workflow_run_id={workflow_run_id})"
) )
params = {
"app": self.app_name,
"external_host": self.ws_client_name,
"format": "ulaw",
"transport": "websocket",
"encapsulation": "none",
"connection_type": "client",
"direction": "both",
"transport_data": transport_data,
}
if channel_id:
params["channelId"] = channel_id
result = await self._ari_request( result = await self._ari_request(
"POST", "POST", "/channels/externalMedia", params=params
"/channels/externalMedia",
params={
"app": self.app_name,
"external_host": self.ws_client_name,
"format": "ulaw",
"transport": "websocket",
"encapsulation": "none",
"connection_type": "client",
"direction": "both",
"transport_data": transport_data,
},
) )
ext_channel_id = result.get("id", "") ext_channel_id = result.get("id", "")
if ext_channel_id: if ext_channel_id:
# Idempotent — caller may have already marked it before the POST.
await self._mark_ext_channel(ext_channel_id) await self._mark_ext_channel(ext_channel_id)
logger.info( logger.info(
f"[ARI org={self.organization_id}] Created external media channel: {ext_channel_id}" f"[ARI org={self.organization_id}] Created external media channel: {ext_channel_id}"
@ -579,42 +631,98 @@ class ARIConnection:
workflow_id: str, workflow_id: str,
user_id: str, user_id: str,
): ):
"""Handle StasisStart by creating external media and bridging.""" """Set up external media for a caller channel that has entered Stasis.
Creates the external media channel via chan_websocket and registers
a pending bridge entry keyed by its channel id. The bridge itself is
created in :meth:`_complete_bridge_after_ext_ready` once the external
media channel has entered Stasis (its own StasisStart event).
"""
ext_channel_id = f"dograh-ext-{uuid.uuid4()}"
try: try:
logger.info( logger.info(
f"[ARI org={self.organization_id}] Setting up external media for " f"[ARI org={self.organization_id}] Setting up external media for "
f"channel {channel_id} via ws_client={self.ws_client_name}" f"channel {channel_id} via ws_client={self.ws_client_name} "
f"(ext_channel_id={ext_channel_id})"
) )
# 1. Track channel for StasisEnd cleanup (Redis) # 1. Track caller channel for StasisEnd cleanup (Redis).
await self._set_channel_run(channel_id, workflow_run_id) await self._set_channel_run(channel_id, workflow_run_id)
# 2. Create external media channel via chan_websocket # 2. Pre-register all ext-channel state synchronously, before the
# Asterisk connects to our backend using websocket_client.conf config, # externalMedia POST is sent. Asterisk can fire StasisStart for
# with routing params appended as URI query params via v() # the ext channel before the POST response returns; registering
ext_channel_id = await self._create_external_media( # here guarantees that event handler finds the marker and the
workflow_id, user_id, workflow_run_id # pending bridge entry regardless of ordering.
await self._mark_ext_channel(ext_channel_id)
await self._set_channel_run(ext_channel_id, workflow_run_id)
await self._set_pending_bridge(
ext_channel_id, channel_id, workflow_run_id
) )
if not ext_channel_id: await db_client.update_workflow_run(
run_id=int(workflow_run_id),
gathered_context={"ext_channel_id": ext_channel_id},
)
# 3. Create the ext media channel with the id we just registered.
created_id = await self._create_external_media(
workflow_id,
user_id,
workflow_run_id,
channel_id=ext_channel_id,
)
if not created_id:
await self._pop_pending_bridge(ext_channel_id)
logger.error( logger.error(
f"[ARI org={self.organization_id}] Failed to create external media for {channel_id}" f"[ARI org={self.organization_id}] Failed to create external "
f"media for {channel_id} (ext_channel_id={ext_channel_id})"
)
return
if created_id != ext_channel_id:
# Asterisk ignored our channelId — pending state is stale and
# will never be consumed. Clear it and surface loudly.
await self._pop_pending_bridge(ext_channel_id)
logger.error(
f"[ARI org={self.organization_id}] Asterisk returned channel "
f"id {created_id} but we requested {ext_channel_id}; "
f"channelId may not be honored on this ARI version"
) )
return return
# 3. Track ext channel for StasisEnd cleanup (Redis) logger.info(
await self._set_channel_run(ext_channel_id, workflow_run_id) f"[ARI org={self.organization_id}] Queued bridge for caller "
f"{channel_id} <-> ext {ext_channel_id} (run {workflow_run_id}); "
f"waiting for ext channel StasisStart"
)
except Exception as e:
await self._pop_pending_bridge(ext_channel_id)
logger.error(
f"[ARI org={self.organization_id}] Error handling StasisStart "
f"for channel {channel_id}: {e}"
)
# 4. Bridge the call channel with the external media channel async def _complete_bridge_after_ext_ready(
self, ext_channel_id: str, pending: dict
):
"""Bridge the caller and external media channels for a queued entry.
Invoked from the external media channel's StasisStart handler with
the pending entry that :meth:`_handle_stasis_start` registered.
Both channels are in the Stasis application at this point, so the
bridge and addChannel calls can succeed.
"""
caller_channel_id = pending["caller_channel_id"]
workflow_run_id = pending["workflow_run_id"]
try:
bridge_id = await self._create_bridge_and_add_channels( bridge_id = await self._create_bridge_and_add_channels(
[channel_id, ext_channel_id] [caller_channel_id, ext_channel_id]
) )
if not bridge_id: if not bridge_id:
logger.error( logger.error(
f"[ARI org={self.organization_id}] Failed to bridge channels" f"[ARI org={self.organization_id}] Failed to bridge "
f"channels {caller_channel_id} <-> {ext_channel_id}"
) )
return return
# 5. Store ARI resource IDs in gathered_context for cleanup/debugging
await db_client.update_workflow_run( await db_client.update_workflow_run(
run_id=int(workflow_run_id), run_id=int(workflow_run_id),
gathered_context={ gathered_context={
@ -624,8 +732,8 @@ class ARIConnection:
) )
except Exception as e: except Exception as e:
logger.error( logger.error(
f"[ARI org={self.organization_id}] Error handling StasisStart " f"[ARI org={self.organization_id}] Error completing bridge for "
f"for channel {channel_id}: {e}" f"caller {caller_channel_id} / ext {ext_channel_id}: {e}"
) )
async def _handle_stasis_end(self, channel_id: str, workflow_run_id: str): async def _handle_stasis_end(self, channel_id: str, workflow_run_id: str):