mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-19 08:28:10 +02:00
Feat/campaign enhancements (#163)
* feat: add circuit breaker to safeguard * feat: Add Circuit breaker in campaigns to safeguard against telephony failures * feat: add schedules in campaigns
This commit is contained in:
parent
7552b6c819
commit
fe4ea648e4
17 changed files with 2037 additions and 149 deletions
|
|
@ -33,6 +33,9 @@ class CampaignEventType(str, Enum):
|
|||
RETRY_SCHEDULED = "retry_scheduled"
|
||||
RETRY_FAILED = "retry_failed"
|
||||
|
||||
# Circuit breaker events
|
||||
CIRCUIT_BREAKER_TRIPPED = "circuit_breaker_tripped"
|
||||
|
||||
|
||||
class RetryReason(str, Enum):
|
||||
"""Reasons for retry."""
|
||||
|
|
@ -218,6 +221,18 @@ class RetryFailedEvent(BaseCampaignEvent):
|
|||
last_reason: str = "" # RetryReason value
|
||||
|
||||
|
||||
@dataclass
class CircuitBreakerTrippedEvent(BaseCampaignEvent):
    """Campaign event emitted when the circuit breaker trips and pauses a campaign.

    The payload carries the sliding-window statistics that were observed at
    the moment the breaker tripped.
    """

    # Discriminator used by parse_campaign_event to pick this class.
    type: str = CampaignEventType.CIRCUIT_BREAKER_TRIPPED
    # Fraction of calls in the window that failed (failures / total calls).
    failure_rate: float = 0.0
    # Number of failed calls counted in the window.
    failure_count: int = 0
    # Number of successful calls counted in the window.
    success_count: int = 0
    # Failure-rate threshold the breaker was evaluated against.
    threshold: float = 0.0
    # Length of the sliding window, in seconds.
    window_seconds: int = 0
|
||||
|
||||
|
||||
def parse_campaign_event(data: str) -> Any:
|
||||
"""Parse a campaign event message."""
|
||||
try:
|
||||
|
|
@ -239,6 +254,7 @@ def parse_campaign_event(data: str) -> Any:
|
|||
CampaignEventType.RETRY_NEEDED: RetryNeededEvent,
|
||||
CampaignEventType.RETRY_SCHEDULED: RetryScheduledEvent,
|
||||
CampaignEventType.RETRY_FAILED: RetryFailedEvent,
|
||||
CampaignEventType.CIRCUIT_BREAKER_TRIPPED: CircuitBreakerTrippedEvent,
|
||||
}
|
||||
|
||||
event_class = event_class_map.get(event_type)
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from api.services.campaign.campaign_event_protocol import (
|
|||
BatchCompletedEvent,
|
||||
BatchFailedEvent,
|
||||
CampaignCompletedEvent,
|
||||
CircuitBreakerTrippedEvent,
|
||||
RetryNeededEvent,
|
||||
SyncCompletedEvent,
|
||||
)
|
||||
|
|
@ -123,6 +124,32 @@ class CampaignEventPublisher:
|
|||
|
||||
await self.redis.publish(RedisChannel.CAMPAIGN_EVENTS.value, event.to_json())
|
||||
|
||||
async def publish_circuit_breaker_tripped(
    self,
    campaign_id: int,
    failure_rate: float,
    failure_count: int,
    success_count: int,
    threshold: float,
    window_seconds: int,
):
    """Broadcast a circuit-breaker-tripped event on the campaign events channel.

    Args:
        campaign_id: Campaign whose breaker tripped.
        failure_rate: Fraction of failed calls in the window.
        failure_count: Failed calls counted in the window.
        success_count: Successful calls counted in the window.
        threshold: Failure-rate threshold that was evaluated.
        window_seconds: Sliding window length in seconds.
    """
    window_stats = {
        "failure_rate": failure_rate,
        "failure_count": failure_count,
        "success_count": success_count,
        "threshold": threshold,
        "window_seconds": window_seconds,
    }
    tripped_event = CircuitBreakerTrippedEvent(campaign_id=campaign_id, **window_stats)

    channel = RedisChannel.CAMPAIGN_EVENTS.value
    await self.redis.publish(channel, tripped_event.to_json())

    # Logged at warning level: a trip means the campaign was paused.
    logger.warning(
        f"Published circuit breaker tripped event for campaign {campaign_id}: "
        f"failure_rate={failure_rate:.2%} ({failure_count} failures)"
    )
|
||||
|
||||
|
||||
# Global publisher instance with lazy Redis connection
|
||||
async def get_campaign_event_publisher() -> CampaignEventPublisher:
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ import asyncio
|
|||
import signal
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import Dict
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import redis.asyncio as aioredis
|
||||
from loguru import logger
|
||||
|
|
@ -25,11 +26,13 @@ from api.enums import RedisChannel
|
|||
from api.services.campaign.campaign_event_protocol import (
|
||||
BatchCompletedEvent,
|
||||
BatchFailedEvent,
|
||||
CircuitBreakerTrippedEvent,
|
||||
RetryNeededEvent,
|
||||
SyncCompletedEvent,
|
||||
parse_campaign_event,
|
||||
)
|
||||
from api.services.campaign.campaign_event_publisher import CampaignEventPublisher
|
||||
from api.services.campaign.circuit_breaker import circuit_breaker
|
||||
from api.tasks.arq import enqueue_job
|
||||
from api.tasks.function_names import FunctionNames
|
||||
|
||||
|
|
@ -165,6 +168,14 @@ class CampaignOrchestrator:
|
|||
await self._schedule_next_batch(campaign_id)
|
||||
self._last_activity[campaign_id] = datetime.now(UTC)
|
||||
|
||||
elif isinstance(event, CircuitBreakerTrippedEvent):
|
||||
# Circuit breaker tripped - clear state for this campaign
|
||||
logger.warning(
|
||||
f"campaign_id: {campaign_id} - Circuit breaker tripped event received: "
|
||||
f"failure_rate={event.failure_rate:.2%}"
|
||||
)
|
||||
self._clear_campaign_state(campaign_id)
|
||||
|
||||
async def _handle_retry_event(self, event: RetryNeededEvent):
|
||||
"""Process retry event and schedule if eligible (from campaign_retry_manager)."""
|
||||
|
||||
|
|
@ -274,6 +285,53 @@ class CampaignOrchestrator:
|
|||
f"last reason: {reason}"
|
||||
)
|
||||
|
||||
def _is_within_schedule(self, campaign: CampaignModel) -> bool:
|
||||
"""Check if the current time falls within the campaign's schedule windows.
|
||||
|
||||
Returns True (allow scheduling) if:
|
||||
- No schedule_config in metadata
|
||||
- Schedule is disabled
|
||||
- No slots configured
|
||||
- Invalid timezone (fail open)
|
||||
- Current time matches a slot
|
||||
"""
|
||||
if not campaign.orchestrator_metadata:
|
||||
return True
|
||||
|
||||
schedule_config = campaign.orchestrator_metadata.get("schedule_config")
|
||||
if not schedule_config:
|
||||
return True
|
||||
|
||||
if not schedule_config.get("enabled", False):
|
||||
return True
|
||||
|
||||
slots = schedule_config.get("slots")
|
||||
if not slots:
|
||||
return True
|
||||
|
||||
timezone_str = schedule_config.get("timezone", "UTC")
|
||||
try:
|
||||
tz = ZoneInfo(timezone_str)
|
||||
except (KeyError, Exception):
|
||||
logger.warning(
|
||||
f"campaign_id: {campaign.id} - Invalid timezone '{timezone_str}' in schedule_config, "
|
||||
f"failing open (allowing scheduling)"
|
||||
)
|
||||
return True
|
||||
|
||||
now = datetime.now(tz)
|
||||
current_day = now.weekday() # 0=Monday through 6=Sunday
|
||||
current_time = now.strftime("%H:%M")
|
||||
|
||||
for slot in slots:
|
||||
if slot.get("day_of_week") == current_day:
|
||||
start = slot.get("start_time", "")
|
||||
end = slot.get("end_time", "")
|
||||
if start <= current_time < end:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
async def _schedule_next_batch(self, campaign_id: int):
|
||||
"""Schedule next batch immediately if work available."""
|
||||
|
||||
|
|
@ -302,6 +360,40 @@ class CampaignOrchestrator:
|
|||
)
|
||||
return
|
||||
|
||||
# Check schedule window before scheduling
|
||||
if not self._is_within_schedule(campaign):
|
||||
logger.info(
|
||||
f"campaign_id: {campaign_id} - Outside scheduled time window, skipping batch"
|
||||
)
|
||||
return
|
||||
|
||||
# Safety net: check circuit breaker before scheduling
|
||||
cb_config = None
|
||||
if campaign.orchestrator_metadata:
|
||||
cb_config = campaign.orchestrator_metadata.get("circuit_breaker")
|
||||
|
||||
is_open, stats = await circuit_breaker.is_circuit_open(
|
||||
campaign_id=campaign_id,
|
||||
config=cb_config,
|
||||
)
|
||||
|
||||
if is_open and stats:
|
||||
logger.warning(
|
||||
f"campaign_id: {campaign_id} - Circuit breaker is open, "
|
||||
f"pausing campaign. Stats: {stats}"
|
||||
)
|
||||
await db_client.update_campaign(campaign_id=campaign_id, state="paused")
|
||||
await self.publisher.publish_circuit_breaker_tripped(
|
||||
campaign_id=campaign_id,
|
||||
failure_rate=stats["failure_rate"],
|
||||
failure_count=stats["failure_count"],
|
||||
success_count=stats["success_count"],
|
||||
threshold=stats["threshold"],
|
||||
window_seconds=stats["window_seconds"],
|
||||
)
|
||||
self._clear_campaign_state(campaign_id)
|
||||
return
|
||||
|
||||
# Check for available work (queued runs + due retries)
|
||||
has_work = await self._has_pending_work(campaign_id)
|
||||
|
||||
|
|
@ -399,6 +491,12 @@ class CampaignOrchestrator:
|
|||
if campaign_id not in self._batch_in_progress:
|
||||
has_work = await self._has_pending_work(campaign_id)
|
||||
if has_work:
|
||||
if not self._is_within_schedule(campaign):
|
||||
logger.info(
|
||||
f"campaign_id: {campaign_id} - Found orphaned work but outside "
|
||||
f"schedule window, skipping"
|
||||
)
|
||||
continue
|
||||
logger.info(
|
||||
f"campaign_id: {campaign_id} - Found orphaned work (likely new retries), "
|
||||
f"scheduling batch to process"
|
||||
|
|
@ -428,6 +526,12 @@ class CampaignOrchestrator:
|
|||
# Check for any pending work
|
||||
has_work = await self._has_pending_work(campaign_id)
|
||||
if has_work:
|
||||
# If outside schedule window, don't mark complete — work remains for next window
|
||||
if not self._is_within_schedule(campaign):
|
||||
logger.debug(
|
||||
f"campaign_id: {campaign_id} - Outside schedule window with pending work, "
|
||||
f"not marking complete"
|
||||
)
|
||||
return False
|
||||
|
||||
# Check in-memory last activity
|
||||
|
|
|
|||
301
api/services/campaign/circuit_breaker.py
Normal file
301
api/services/campaign/circuit_breaker.py
Normal file
|
|
@ -0,0 +1,301 @@
|
|||
"""Campaign circuit breaker for automatic pause on high failure rates.
|
||||
|
||||
Uses two Redis sorted sets (ZSETs) per campaign — one for failures, one for
|
||||
successes — as sliding windows. ZCARD gives O(1) counts without iterating
|
||||
members, keeping the Lua scripts simple.
|
||||
"""
|
||||
|
||||
import time
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import redis.asyncio as aioredis
|
||||
from loguru import logger
|
||||
|
||||
from api.constants import DEFAULT_CIRCUIT_BREAKER_CONFIG, REDIS_URL
|
||||
from api.db import db_client
|
||||
from api.services.campaign.campaign_event_publisher import get_campaign_event_publisher
|
||||
|
||||
|
||||
class CircuitBreaker:
    """Sliding window circuit breaker for campaign call failures.

    Each campaign has two Redis sorted sets, one for failures and one for
    successes. Every recorded outcome is a member scored by its Unix
    timestamp; entries older than the window are trimmed before counting.
    The breaker trips when at least ``min_calls_in_window`` outcomes are in
    the window and the failure fraction is at or above ``failure_threshold``.

    All Redis errors fail open: they are logged and reported as "not tripped".
    """

    # Lua: trim both windows, record one outcome under a unique member,
    # refresh TTLs, then evaluate the trip condition atomically.
    _RECORD_SCRIPT = """
    local fail_key = KEYS[1]
    local succ_key = KEYS[2]
    local now = tonumber(ARGV[1])
    local window_start = tonumber(ARGV[2])
    local is_failure = tonumber(ARGV[3])
    local threshold = tonumber(ARGV[4])
    local min_calls = tonumber(ARGV[5])
    local ttl = tonumber(ARGV[6])
    local member = ARGV[7]

    -- Trim both sets to the sliding window
    redis.call('ZREMRANGEBYSCORE', fail_key, 0, window_start)
    redis.call('ZREMRANGEBYSCORE', succ_key, 0, window_start)

    -- Add the new outcome to the appropriate set
    if is_failure == 1 then
        redis.call('ZADD', fail_key, now, member)
    else
        redis.call('ZADD', succ_key, now, member)
    end

    -- Refresh TTL on both keys
    redis.call('EXPIRE', fail_key, ttl)
    redis.call('EXPIRE', succ_key, ttl)

    -- Count via ZCARD (O(1))
    local failures = redis.call('ZCARD', fail_key)
    local successes = redis.call('ZCARD', succ_key)
    local total = failures + successes

    -- Check trip condition
    if total >= min_calls and (failures / total) >= threshold then
        return {1, failures, successes, total}
    end

    return {0, failures, successes, total}
    """

    # Lua: trim both windows and evaluate the trip condition without recording.
    _CHECK_SCRIPT = """
    local fail_key = KEYS[1]
    local succ_key = KEYS[2]
    local window_start = tonumber(ARGV[1])
    local threshold = tonumber(ARGV[2])
    local min_calls = tonumber(ARGV[3])

    -- Trim both sets
    redis.call('ZREMRANGEBYSCORE', fail_key, 0, window_start)
    redis.call('ZREMRANGEBYSCORE', succ_key, 0, window_start)

    -- Count via ZCARD
    local failures = redis.call('ZCARD', fail_key)
    local successes = redis.call('ZCARD', succ_key)
    local total = failures + successes

    if total >= min_calls and (failures / total) >= threshold then
        return {1, failures, successes, total}
    end

    return {0, failures, successes, total}
    """

    def __init__(self):
        # Created lazily by _get_redis() on first use.
        self.redis_client: Optional[aioredis.Redis] = None

    async def _get_redis(self) -> "aioredis.Redis":
        """Get or create Redis connection."""
        if self.redis_client is None:
            self.redis_client = await aioredis.from_url(
                REDIS_URL, decode_responses=True
            )
        return self.redis_client

    @staticmethod
    def _keys(campaign_id: int) -> Tuple[str, str]:
        """Return (failures_key, successes_key) for a campaign."""
        return f"cb_failures:{campaign_id}", f"cb_successes:{campaign_id}"

    @staticmethod
    def _build_stats(result, threshold, window_seconds) -> Tuple[bool, int, dict]:
        """Turn a Lua reply ``[flag, failures, successes, total]`` into stats.

        Returns:
            Tuple of (flag as bool, total calls in window, stats dict with
            failure_rate, failure_count, success_count, threshold,
            window_seconds).
        """
        failure_count = int(result[1])
        success_count = int(result[2])
        total = int(result[3])
        stats = {
            "failure_rate": failure_count / total if total > 0 else 0.0,
            "failure_count": failure_count,
            "success_count": success_count,
            "threshold": threshold,
            "window_seconds": window_seconds,
        }
        return bool(result[0]), total, stats

    async def record_call_outcome(
        self,
        campaign_id: int,
        is_failure: bool,
        config: Optional[dict] = None,
    ) -> Tuple[bool, Optional[dict]]:
        """Record a call outcome and check if the circuit breaker should trip.

        Args:
            campaign_id: The campaign ID.
            is_failure: True if the call failed, False if succeeded.
            config: Optional per-campaign circuit breaker config override.
                Falls back to DEFAULT_CIRCUIT_BREAKER_CONFIG.

        Returns:
            Tuple of (tripped: bool, stats: dict or None).
            stats contains failure_rate, failure_count, success_count,
            threshold, window_seconds. It is None when the breaker is
            disabled or when a Redis error occurred (fail open).
        """
        import uuid  # local import: only needed to build unique ZSET members

        cb_config = {**DEFAULT_CIRCUIT_BREAKER_CONFIG, **(config or {})}

        if not cb_config.get("enabled", True):
            return False, None

        window_seconds = cb_config["window_seconds"]
        threshold = cb_config["failure_threshold"]
        min_calls = cb_config["min_calls_in_window"]

        now = time.time()
        window_start = now - window_seconds

        fail_key, succ_key = self._keys(campaign_id)

        # Sorted-set members must be unique: using the bare timestamp would
        # make two outcomes recorded at the same instant overwrite each other
        # and undercount the window. The score still carries the timestamp.
        member = f"{now}:{uuid.uuid4().hex}"

        try:
            # Connection setup is inside the try so it also fails open.
            redis_client = await self._get_redis()
            result = await redis_client.eval(
                self._RECORD_SCRIPT,
                2,
                fail_key,
                succ_key,
                now,
                window_start,
                1 if is_failure else 0,
                threshold,
                min_calls,
                window_seconds + 60,  # TTL with buffer
                member,
            )

            tripped, total, stats = self._build_stats(
                result, threshold, window_seconds
            )

            if tripped:
                logger.warning(
                    f"Circuit breaker TRIPPED for campaign {campaign_id}: "
                    f"failure_rate={stats['failure_rate']:.2%} ({stats['failure_count']}/{total}) "
                    f"threshold={threshold:.2%} window={window_seconds}s"
                )

            return tripped, stats

        except Exception as e:
            logger.error(f"Circuit breaker error for campaign {campaign_id}: {e}")
            # Fail open - do NOT trip on errors
            return False, None

    async def is_circuit_open(
        self,
        campaign_id: int,
        config: Optional[dict] = None,
    ) -> Tuple[bool, Optional[dict]]:
        """Check if the circuit breaker is in open (tripped) state without recording.

        Used as a safety net check before scheduling batches.

        Args:
            campaign_id: The campaign ID.
            config: Optional per-campaign circuit breaker config override.
                Falls back to DEFAULT_CIRCUIT_BREAKER_CONFIG.

        Returns:
            Tuple of (is_open: bool, stats: dict or None). stats is None when
            the breaker is disabled or when a Redis error occurred (fail open).
        """
        cb_config = {**DEFAULT_CIRCUIT_BREAKER_CONFIG, **(config or {})}

        if not cb_config.get("enabled", True):
            return False, None

        window_seconds = cb_config["window_seconds"]
        threshold = cb_config["failure_threshold"]
        min_calls = cb_config["min_calls_in_window"]

        window_start = time.time() - window_seconds

        fail_key, succ_key = self._keys(campaign_id)

        try:
            # Connection setup is inside the try so it also fails open.
            redis_client = await self._get_redis()
            result = await redis_client.eval(
                self._CHECK_SCRIPT,
                2,
                fail_key,
                succ_key,
                window_start,
                threshold,
                min_calls,
            )

            is_open, _total, stats = self._build_stats(
                result, threshold, window_seconds
            )
            return is_open, stats

        except Exception as e:
            logger.error(f"Circuit breaker check error for campaign {campaign_id}: {e}")
            return False, None

    async def record_and_evaluate(self, campaign_id: int, is_failure: bool) -> None:
        """Record a call outcome, and if the breaker trips, pause the campaign.

        This is the main entry point called from telephony status callbacks.
        It handles fetching campaign config, recording the outcome, and
        pausing + publishing an event if the breaker trips. Outcomes for
        campaigns that are missing or not in the "running" state are ignored.

        Exceptions are caught internally so this never disrupts the caller.
        """
        try:
            campaign = await db_client.get_campaign_by_id(campaign_id)
            if not campaign or campaign.state != "running":
                return

            cb_config = {}
            if campaign.orchestrator_metadata:
                cb_config = campaign.orchestrator_metadata.get("circuit_breaker", {})

            tripped, stats = await self.record_call_outcome(
                campaign_id=campaign_id,
                is_failure=is_failure,
                config=cb_config,
            )

            if tripped and stats:
                logger.warning(
                    f"Circuit breaker tripped for campaign {campaign_id}, "
                    f"pausing campaign. Stats: {stats}"
                )

                await db_client.update_campaign(campaign_id=campaign_id, state="paused")

                publisher = await get_campaign_event_publisher()
                await publisher.publish_circuit_breaker_tripped(
                    campaign_id=campaign_id,
                    failure_rate=stats["failure_rate"],
                    failure_count=stats["failure_count"],
                    success_count=stats["success_count"],
                    threshold=stats["threshold"],
                    window_seconds=stats["window_seconds"],
                )

        except Exception as e:
            logger.error(f"Error in circuit breaker for campaign {campaign_id}: {e}")

    async def reset(self, campaign_id: int) -> bool:
        """Reset the circuit breaker state for a campaign.

        Called when a campaign is resumed to give it a clean slate.

        Returns:
            True if both window keys were deleted, False if Redis raised
            (including while obtaining the connection).
        """
        fail_key, succ_key = self._keys(campaign_id)

        try:
            redis_client = await self._get_redis()
            await redis_client.delete(fail_key, succ_key)
            logger.info(f"Circuit breaker reset for campaign {campaign_id}")
            return True
        except Exception as e:
            logger.error(
                f"Error resetting circuit breaker for campaign {campaign_id}: {e}"
            )
            return False

    async def close(self):
        """Close Redis connection."""
        if self.redis_client:
            await self.redis_client.close()
            self.redis_client = None
|
||||
|
||||
|
||||
# Global circuit breaker instance
|
||||
circuit_breaker = CircuitBreaker()
|
||||
|
|
@ -4,6 +4,7 @@ from typing import Any, Dict
|
|||
from loguru import logger
|
||||
|
||||
from api.db import db_client
|
||||
from api.services.campaign.circuit_breaker import circuit_breaker
|
||||
from api.tasks.arq import enqueue_job
|
||||
from api.tasks.function_names import FunctionNames
|
||||
|
||||
|
|
@ -67,6 +68,9 @@ class CampaignRunnerService:
|
|||
# stale campaign checker would do that if there are pending work.
|
||||
await db_client.update_campaign(campaign_id=campaign_id, state="running")
|
||||
|
||||
# Reset circuit breaker so the resumed campaign starts with a clean slate
|
||||
await circuit_breaker.reset(campaign_id)
|
||||
|
||||
logger.info(f"Campaign {campaign_id} resumed")
|
||||
|
||||
async def get_campaign_status(self, campaign_id: int) -> Dict[str, Any]:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue