Feat/campaign enhancements (#163)

* feat: add circuit breaker to safeguard

* feat: Add Circuit breaker in campaigns to safeguard against telephony failures

* feat: add schedules in campaigns
This commit is contained in:
Abhishek 2026-02-17 21:04:15 +05:30 committed by GitHub
parent 7552b6c819
commit fe4ea648e4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 2037 additions and 149 deletions

View file

@ -33,6 +33,9 @@ class CampaignEventType(str, Enum):
RETRY_SCHEDULED = "retry_scheduled"
RETRY_FAILED = "retry_failed"
# Circuit breaker events
CIRCUIT_BREAKER_TRIPPED = "circuit_breaker_tripped"
class RetryReason(str, Enum):
"""Reasons for retry."""
@ -218,6 +221,18 @@ class RetryFailedEvent(BaseCampaignEvent):
last_reason: str = "" # RetryReason value
@dataclass
class CircuitBreakerTrippedEvent(BaseCampaignEvent):
    """Campaign event announcing that the circuit breaker tripped and paused a campaign.

    The numeric fields are a snapshot of the breaker's sliding window at the
    moment it tripped; they all default to zero so a partially populated
    message still parses.
    """

    # Discriminator used by parse_campaign_event() to select this class.
    type: str = CampaignEventType.CIRCUIT_BREAKER_TRIPPED
    # Share of calls in the window that failed, as a fraction (logged with a % format).
    failure_rate: float = 0.0
    # Number of failed calls counted in the window.
    failure_count: int = 0
    # Number of successful calls counted in the window.
    success_count: int = 0
    # Failure-rate threshold that was configured for the campaign.
    threshold: float = 0.0
    # Length of the sliding window, in seconds.
    window_seconds: int = 0
def parse_campaign_event(data: str) -> Any:
"""Parse a campaign event message."""
try:
@ -239,6 +254,7 @@ def parse_campaign_event(data: str) -> Any:
CampaignEventType.RETRY_NEEDED: RetryNeededEvent,
CampaignEventType.RETRY_SCHEDULED: RetryScheduledEvent,
CampaignEventType.RETRY_FAILED: RetryFailedEvent,
CampaignEventType.CIRCUIT_BREAKER_TRIPPED: CircuitBreakerTrippedEvent,
}
event_class = event_class_map.get(event_type)

View file

@ -14,6 +14,7 @@ from api.services.campaign.campaign_event_protocol import (
BatchCompletedEvent,
BatchFailedEvent,
CampaignCompletedEvent,
CircuitBreakerTrippedEvent,
RetryNeededEvent,
SyncCompletedEvent,
)
@ -123,6 +124,32 @@ class CampaignEventPublisher:
await self.redis.publish(RedisChannel.CAMPAIGN_EVENTS.value, event.to_json())
async def publish_circuit_breaker_tripped(
    self,
    campaign_id: int,
    failure_rate: float,
    failure_count: int,
    success_count: int,
    threshold: float,
    window_seconds: int,
):
    """Broadcast a ``CircuitBreakerTrippedEvent`` on the campaign events channel.

    Args:
        campaign_id: Campaign whose breaker tripped.
        failure_rate: Failed share of the calls in the window (fraction).
        failure_count: Failed calls counted in the window.
        success_count: Successful calls counted in the window.
        threshold: Failure-rate threshold that was exceeded.
        window_seconds: Length of the sliding window, in seconds.
    """
    window_stats = {
        "failure_rate": failure_rate,
        "failure_count": failure_count,
        "success_count": success_count,
        "threshold": threshold,
        "window_seconds": window_seconds,
    }
    tripped_event = CircuitBreakerTrippedEvent(campaign_id=campaign_id, **window_stats)

    channel = RedisChannel.CAMPAIGN_EVENTS.value
    await self.redis.publish(channel, tripped_event.to_json())

    # Logged at warning level: a tripped breaker means the campaign was paused.
    message = (
        f"Published circuit breaker tripped event for campaign {campaign_id}: "
        f"failure_rate={failure_rate:.2%} ({failure_count} failures)"
    )
    logger.warning(message)
# Global publisher instance with lazy Redis connection
async def get_campaign_event_publisher() -> CampaignEventPublisher:

View file

@ -14,6 +14,7 @@ import asyncio
import signal
from datetime import UTC, datetime, timedelta
from typing import Dict
from zoneinfo import ZoneInfo
import redis.asyncio as aioredis
from loguru import logger
@ -25,11 +26,13 @@ from api.enums import RedisChannel
from api.services.campaign.campaign_event_protocol import (
BatchCompletedEvent,
BatchFailedEvent,
CircuitBreakerTrippedEvent,
RetryNeededEvent,
SyncCompletedEvent,
parse_campaign_event,
)
from api.services.campaign.campaign_event_publisher import CampaignEventPublisher
from api.services.campaign.circuit_breaker import circuit_breaker
from api.tasks.arq import enqueue_job
from api.tasks.function_names import FunctionNames
@ -165,6 +168,14 @@ class CampaignOrchestrator:
await self._schedule_next_batch(campaign_id)
self._last_activity[campaign_id] = datetime.now(UTC)
elif isinstance(event, CircuitBreakerTrippedEvent):
# Circuit breaker tripped - clear state for this campaign
logger.warning(
f"campaign_id: {campaign_id} - Circuit breaker tripped event received: "
f"failure_rate={event.failure_rate:.2%}"
)
self._clear_campaign_state(campaign_id)
async def _handle_retry_event(self, event: RetryNeededEvent):
"""Process retry event and schedule if eligible (from campaign_retry_manager)."""
@ -274,6 +285,53 @@ class CampaignOrchestrator:
f"last reason: {reason}"
)
def _is_within_schedule(self, campaign: CampaignModel) -> bool:
    """Return whether the campaign is allowed to schedule calls right now.

    The schedule is read from ``campaign.orchestrator_metadata["schedule_config"]``.

    Fails open (returns True, scheduling allowed) when:
      - the campaign has no orchestrator metadata or no ``schedule_config``
      - the schedule is not enabled
      - no slots are configured
      - the configured timezone cannot be loaded

    Otherwise returns True only if the current time in the schedule's
    timezone falls inside a slot for today, i.e. ``day_of_week`` equals
    ``datetime.weekday()`` (0=Monday .. 6=Sunday) and
    ``start_time <= now < end_time``.

    Slot times are parsed as ``H:MM`` / ``HH:MM`` (any seconds part is
    ignored; ``24:00`` is accepted as end of day) and compared numerically,
    so unpadded values such as ``"9:00"`` behave as expected. A slot that is
    not a mapping or carries an unparseable time is skipped with a warning
    instead of raising. Slots do not wrap past midnight: a slot whose end is
    not after its start never matches.
    """
    metadata = campaign.orchestrator_metadata
    if not metadata:
        return True

    schedule_config = metadata.get("schedule_config")
    if not schedule_config or not schedule_config.get("enabled", False):
        return True

    slots = schedule_config.get("slots")
    if not slots:
        return True

    timezone_str = schedule_config.get("timezone", "UTC")
    try:
        tz = ZoneInfo(timezone_str)
    except Exception:
        # Deliberately broad: ZoneInfo raises different exception types for
        # unknown keys, malformed keys and non-string values, and every one
        # of them must fail open rather than block the campaign.
        logger.warning(
            f"campaign_id: {campaign.id} - Invalid timezone '{timezone_str}' in schedule_config, "
            f"failing open (allowing scheduling)"
        )
        return True

    def _to_minutes(value) -> int | None:
        """Convert an "H:MM"-style value to minutes since midnight, or None if invalid."""
        try:
            hours, minutes = (int(part) for part in str(value).split(":")[:2])
        except ValueError:
            return None
        if not (0 <= hours <= 24 and 0 <= minutes < 60):
            return None
        if hours == 24 and minutes:
            return None
        return hours * 60 + minutes

    now = datetime.now(tz)
    current_day = now.weekday()  # 0=Monday through 6=Sunday
    current_minutes = now.hour * 60 + now.minute

    for slot in slots:
        if not isinstance(slot, dict):
            logger.warning(
                f"campaign_id: {campaign.id} - Ignoring malformed schedule slot {slot!r}"
            )
            continue
        if slot.get("day_of_week") != current_day:
            continue

        # An omitted start_time keeps its historical meaning: the slot opens
        # at midnight.
        start_raw = slot.get("start_time", "")
        start = 0 if start_raw == "" else _to_minutes(start_raw)
        end = _to_minutes(slot.get("end_time", ""))
        if start is None or end is None:
            logger.warning(
                f"campaign_id: {campaign.id} - Ignoring malformed schedule slot {slot!r}"
            )
            continue

        if start <= current_minutes < end:
            return True

    return False
async def _schedule_next_batch(self, campaign_id: int):
"""Schedule next batch immediately if work available."""
@ -302,6 +360,40 @@ class CampaignOrchestrator:
)
return
# Check schedule window before scheduling
if not self._is_within_schedule(campaign):
logger.info(
f"campaign_id: {campaign_id} - Outside scheduled time window, skipping batch"
)
return
# Safety net: check circuit breaker before scheduling
cb_config = None
if campaign.orchestrator_metadata:
cb_config = campaign.orchestrator_metadata.get("circuit_breaker")
is_open, stats = await circuit_breaker.is_circuit_open(
campaign_id=campaign_id,
config=cb_config,
)
if is_open and stats:
logger.warning(
f"campaign_id: {campaign_id} - Circuit breaker is open, "
f"pausing campaign. Stats: {stats}"
)
await db_client.update_campaign(campaign_id=campaign_id, state="paused")
await self.publisher.publish_circuit_breaker_tripped(
campaign_id=campaign_id,
failure_rate=stats["failure_rate"],
failure_count=stats["failure_count"],
success_count=stats["success_count"],
threshold=stats["threshold"],
window_seconds=stats["window_seconds"],
)
self._clear_campaign_state(campaign_id)
return
# Check for available work (queued runs + due retries)
has_work = await self._has_pending_work(campaign_id)
@ -399,6 +491,12 @@ class CampaignOrchestrator:
if campaign_id not in self._batch_in_progress:
has_work = await self._has_pending_work(campaign_id)
if has_work:
if not self._is_within_schedule(campaign):
logger.info(
f"campaign_id: {campaign_id} - Found orphaned work but outside "
f"schedule window, skipping"
)
continue
logger.info(
f"campaign_id: {campaign_id} - Found orphaned work (likely new retries), "
f"scheduling batch to process"
@ -428,6 +526,12 @@ class CampaignOrchestrator:
# Check for any pending work
has_work = await self._has_pending_work(campaign_id)
if has_work:
# If outside schedule window, don't mark complete — work remains for next window
if not self._is_within_schedule(campaign):
logger.debug(
f"campaign_id: {campaign_id} - Outside schedule window with pending work, "
f"not marking complete"
)
return False
# Check in-memory last activity

View file

@ -0,0 +1,301 @@
"""Campaign circuit breaker for automatic pause on high failure rates.
Uses two Redis sorted sets (ZSETs) per campaign — one for failures, one for
successes — as sliding windows. ZCARD gives O(1) counts without iterating
members, keeping the Lua scripts simple.
"""
import time
import uuid
from typing import Optional, Tuple

import redis.asyncio as aioredis
from loguru import logger

from api.constants import DEFAULT_CIRCUIT_BREAKER_CONFIG, REDIS_URL
from api.db import db_client
from api.services.campaign.campaign_event_publisher import get_campaign_event_publisher
class CircuitBreaker:
    """Sliding-window circuit breaker for campaign call failures.

    Each campaign owns two Redis sorted sets, one for failed and one for
    successful calls. Entries are scored with the wall-clock time of the
    outcome; entries at or before the start of the configured window are
    trimmed on every evaluation, so ``ZCARD`` yields the in-window counts.

    The breaker is considered tripped when the window holds at least
    ``min_calls_in_window`` outcomes and the failure share is at or above
    ``failure_threshold``.

    Redis errors never trip the breaker: the record/check methods log the
    error and fail open.
    """

    # Trims both windows, records one outcome, refreshes the TTLs and
    # evaluates the trip condition in one atomic script.
    # ARGV[7] is a per-outcome unique member: ZADD only updates the score of
    # an existing member, so using the timestamp itself as the member would
    # merge two outcomes that share a timestamp into a single entry.
    _RECORD_SCRIPT = """
    local fail_key = KEYS[1]
    local succ_key = KEYS[2]
    local now = tonumber(ARGV[1])
    local window_start = tonumber(ARGV[2])
    local is_failure = tonumber(ARGV[3])
    local threshold = tonumber(ARGV[4])
    local min_calls = tonumber(ARGV[5])
    local ttl = tonumber(ARGV[6])
    local member = ARGV[7]

    -- Trim both sets to the sliding window
    redis.call('ZREMRANGEBYSCORE', fail_key, 0, window_start)
    redis.call('ZREMRANGEBYSCORE', succ_key, 0, window_start)

    -- Add the new outcome to the appropriate set
    if is_failure == 1 then
        redis.call('ZADD', fail_key, now, member)
    else
        redis.call('ZADD', succ_key, now, member)
    end

    -- Refresh TTL on both keys
    redis.call('EXPIRE', fail_key, ttl)
    redis.call('EXPIRE', succ_key, ttl)

    -- Count via ZCARD (O(1))
    local failures = redis.call('ZCARD', fail_key)
    local successes = redis.call('ZCARD', succ_key)
    local total = failures + successes

    -- Check trip condition
    if total >= min_calls and (failures / total) >= threshold then
        return {1, failures, successes, total}
    end
    return {0, failures, successes, total}
    """

    # Same evaluation as _RECORD_SCRIPT but without recording an outcome or
    # touching the TTLs.
    _CHECK_SCRIPT = """
    local fail_key = KEYS[1]
    local succ_key = KEYS[2]
    local window_start = tonumber(ARGV[1])
    local threshold = tonumber(ARGV[2])
    local min_calls = tonumber(ARGV[3])

    -- Trim both sets
    redis.call('ZREMRANGEBYSCORE', fail_key, 0, window_start)
    redis.call('ZREMRANGEBYSCORE', succ_key, 0, window_start)

    -- Count via ZCARD
    local failures = redis.call('ZCARD', fail_key)
    local successes = redis.call('ZCARD', succ_key)
    local total = failures + successes

    if total >= min_calls and (failures / total) >= threshold then
        return {1, failures, successes, total}
    end
    return {0, failures, successes, total}
    """

    def __init__(self):
        # Created lazily by _get_redis() so importing the module does no I/O.
        self.redis_client: Optional[aioredis.Redis] = None

    async def _get_redis(self) -> aioredis.Redis:
        """Get or create Redis connection."""
        if self.redis_client is None:
            self.redis_client = await aioredis.from_url(
                REDIS_URL, decode_responses=True
            )
        return self.redis_client

    @staticmethod
    def _keys(campaign_id: int) -> Tuple[str, str]:
        """Return (failures_key, successes_key) for a campaign."""
        return f"cb_failures:{campaign_id}", f"cb_successes:{campaign_id}"

    @staticmethod
    def _parse_result(
        result, threshold: float, window_seconds: int
    ) -> Tuple[bool, dict]:
        """Turn a script reply ``{flag, failures, successes, total}`` into (flag, stats)."""
        failure_count = int(result[1])
        success_count = int(result[2])
        total = int(result[3])
        stats = {
            "failure_rate": failure_count / total if total > 0 else 0.0,
            "failure_count": failure_count,
            "success_count": success_count,
            "threshold": threshold,
            "window_seconds": window_seconds,
        }
        return bool(result[0]), stats

    async def record_call_outcome(
        self,
        campaign_id: int,
        is_failure: bool,
        config: Optional[dict] = None,
    ) -> Tuple[bool, Optional[dict]]:
        """Record a call outcome and check if the circuit breaker should trip.

        Args:
            campaign_id: The campaign ID.
            is_failure: True if the call failed, False if succeeded.
            config: Optional per-campaign circuit breaker config override.
                Falls back to DEFAULT_CIRCUIT_BREAKER_CONFIG.

        Returns:
            Tuple of (tripped, stats). ``stats`` holds failure_rate,
            failure_count, success_count, threshold and window_seconds for
            every successful evaluation, tripped or not. It is None when the
            breaker is disabled or Redis failed, in which case ``tripped`` is
            False (fail open).
        """
        cb_config = {**DEFAULT_CIRCUIT_BREAKER_CONFIG, **(config or {})}
        if not cb_config.get("enabled", True):
            return False, None

        window_seconds = cb_config["window_seconds"]
        threshold = cb_config["failure_threshold"]
        min_calls = cb_config["min_calls_in_window"]

        now = time.time()
        window_start = now - window_seconds
        fail_key, succ_key = self._keys(campaign_id)
        # Unique per outcome so concurrent callbacks with an identical
        # timestamp are each counted.
        member = f"{now}:{uuid.uuid4().hex}"

        try:
            redis_client = await self._get_redis()
            result = await redis_client.eval(
                self._RECORD_SCRIPT,
                2,
                fail_key,
                succ_key,
                now,
                window_start,
                1 if is_failure else 0,
                threshold,
                min_calls,
                window_seconds + 60,  # TTL with buffer
                member,
            )
            tripped, stats = self._parse_result(result, threshold, window_seconds)

            if tripped:
                total = int(result[3])
                logger.warning(
                    f"Circuit breaker TRIPPED for campaign {campaign_id}: "
                    f"failure_rate={stats['failure_rate']:.2%} "
                    f"({stats['failure_count']}/{total}) "
                    f"threshold={threshold:.2%} window={window_seconds}s"
                )

            return tripped, stats
        except Exception as e:
            logger.error(f"Circuit breaker error for campaign {campaign_id}: {e}")
            # Fail open - do NOT trip on errors
            return False, None

    async def is_circuit_open(
        self,
        campaign_id: int,
        config: Optional[dict] = None,
    ) -> Tuple[bool, Optional[dict]]:
        """Check if the circuit breaker is in open (tripped) state without recording.

        Used as a safety net check before scheduling batches. Expired entries
        are still trimmed, but no outcome is added.

        Returns:
            Tuple of (is_open, stats) with the same stats keys as
            ``record_call_outcome``. Returns (False, None) when the breaker is
            disabled or Redis failed.
        """
        cb_config = {**DEFAULT_CIRCUIT_BREAKER_CONFIG, **(config or {})}
        if not cb_config.get("enabled", True):
            return False, None

        window_seconds = cb_config["window_seconds"]
        threshold = cb_config["failure_threshold"]
        min_calls = cb_config["min_calls_in_window"]

        window_start = time.time() - window_seconds
        fail_key, succ_key = self._keys(campaign_id)

        try:
            redis_client = await self._get_redis()
            result = await redis_client.eval(
                self._CHECK_SCRIPT,
                2,
                fail_key,
                succ_key,
                window_start,
                threshold,
                min_calls,
            )
            return self._parse_result(result, threshold, window_seconds)
        except Exception as e:
            logger.error(f"Circuit breaker check error for campaign {campaign_id}: {e}")
            return False, None

    async def record_and_evaluate(self, campaign_id: int, is_failure: bool) -> None:
        """Record a call outcome, and if the breaker trips, pause the campaign.

        This is the main entry point called from telephony status callbacks.
        It handles fetching campaign config, recording the outcome, and
        pausing + publishing an event if the breaker trips. Outcomes for
        campaigns that are missing or not in the "running" state are ignored.

        Exceptions are caught internally so this never disrupts the caller.
        """
        try:
            campaign = await db_client.get_campaign_by_id(campaign_id)
            if not campaign or campaign.state != "running":
                return

            cb_config = {}
            if campaign.orchestrator_metadata:
                cb_config = campaign.orchestrator_metadata.get("circuit_breaker", {})

            tripped, stats = await self.record_call_outcome(
                campaign_id=campaign_id,
                is_failure=is_failure,
                config=cb_config,
            )
            if not (tripped and stats):
                return

            logger.warning(
                f"Circuit breaker tripped for campaign {campaign_id}, "
                f"pausing campaign. Stats: {stats}"
            )
            await db_client.update_campaign(campaign_id=campaign_id, state="paused")

            publisher = await get_campaign_event_publisher()
            await publisher.publish_circuit_breaker_tripped(
                campaign_id=campaign_id,
                failure_rate=stats["failure_rate"],
                failure_count=stats["failure_count"],
                success_count=stats["success_count"],
                threshold=stats["threshold"],
                window_seconds=stats["window_seconds"],
            )
        except Exception as e:
            logger.error(f"Error in circuit breaker for campaign {campaign_id}: {e}")

    async def reset(self, campaign_id: int) -> bool:
        """Reset the circuit breaker state for a campaign.

        Called when a campaign is resumed to give it a clean slate.

        Returns:
            True if both window keys were deleted, False if Redis failed.
        """
        fail_key, succ_key = self._keys(campaign_id)
        try:
            redis_client = await self._get_redis()
            await redis_client.delete(fail_key, succ_key)
            logger.info(f"Circuit breaker reset for campaign {campaign_id}")
            return True
        except Exception as e:
            logger.error(
                f"Error resetting circuit breaker for campaign {campaign_id}: {e}"
            )
            return False

    async def close(self):
        """Close Redis connection."""
        if self.redis_client:
            await self.redis_client.close()
            self.redis_client = None
# Global circuit breaker instance, imported by the campaign orchestrator and
# the campaign runner service. Constructing it does no I/O: the Redis
# connection is created lazily on first use (see CircuitBreaker._get_redis).
circuit_breaker = CircuitBreaker()

View file

@ -4,6 +4,7 @@ from typing import Any, Dict
from loguru import logger
from api.db import db_client
from api.services.campaign.circuit_breaker import circuit_breaker
from api.tasks.arq import enqueue_job
from api.tasks.function_names import FunctionNames
@ -67,6 +68,9 @@ class CampaignRunnerService:
# stale campaign checker would do that if there are pending work.
await db_client.update_campaign(campaign_id=campaign_id, state="running")
# Reset circuit breaker so the resumed campaign starts with a clean slate
await circuit_breaker.reset(campaign_id)
logger.info(f"Campaign {campaign_id} resumed")
async def get_campaign_status(self, campaign_id: int) -> Dict[str, Any]: