plano/cli/planoai/chatgpt_watchdog.py

211 lines
6.8 KiB
Python

"""
Background daemon that monitors ChatGPT OAuth token expiry and restarts
Plano processes with a fresh token before the old one expires.
The watchdog is spawned by start_native() when ChatGPT providers are present.
It runs as a fully daemonized process (double-fork) and exits after triggering
a restart (a fresh watchdog is spawned by the new start_native() call).
"""
import json
import os
import time
from typing import Optional
from planoai.consts import (
NATIVE_PID_FILE,
PLANO_RUN_DIR,
PLANO_WATCHDOG_LOG_FILE,
PLANO_WATCHDOG_STATE_FILE,
)
# Wake up this many seconds before token expiry to refresh
WATCHDOG_REFRESH_LEAD_SECONDS = 5 * 60 # 5 minutes
# How often the watchdog polls for expiry (in seconds)
WATCHDOG_POLL_INTERVAL_SECONDS = 30
# Env var sentinel: if set, spawn_watchdog() is a no-op (prevents recursive spawning)
_NO_WATCHDOG_ENV_VAR = "PLANO_NO_WATCHDOG"
def _log(msg: str) -> None:
print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] watchdog: {msg}", flush=True)
def _seconds_until_expiry() -> Optional[float]:
"""Return seconds until the ChatGPT access token expires, or None if unknown."""
try:
from planoai.chatgpt_auth import load_auth
auth = load_auth()
if not auth:
return None
expires_at = auth.get("expires_at")
if not expires_at:
return None
return float(expires_at) - time.time()
except Exception:
return None
def _stop_services(skip_pids: set) -> None:
"""Stop envoy and brightstaff without killing the watchdog (self)."""
from planoai.native_runner import stop_native
stop_native(skip_pids=skip_pids)
def _do_restart(plano_config_file: str) -> bool:
"""
Refresh the ChatGPT token, stop Envoy+brightstaff, restart with the new token.
Returns True on success, False if the refresh failed.
"""
# 1. Load saved state (env dict + metadata)
if not os.path.exists(PLANO_WATCHDOG_STATE_FILE):
_log("Watchdog state file missing — cannot restart services")
return False
with open(PLANO_WATCHDOG_STATE_FILE) as f:
state = json.load(f)
env = state["env"]
with_tracing = state.get("with_tracing", False)
# 2. Refresh the token
try:
from planoai.chatgpt_auth import get_access_token
access_token, account_id = get_access_token()
except Exception as exc:
_log(
f"Token refresh failed: {exc}"
"run 'planoai chatgpt login' to re-authenticate"
)
return False
env["CHATGPT_ACCESS_TOKEN"] = access_token
if account_id:
env["CHATGPT_ACCOUNT_ID"] = account_id
# 3. Stop envoy + brightstaff (skip self so we don't self-terminate)
_stop_services(skip_pids={os.getpid()})
# 4. Unset the sentinel so start_native() spawns a fresh watchdog
os.environ.pop(_NO_WATCHDOG_ENV_VAR, None)
# 5. Restart with the fresh token; this also spawns the next watchdog
from planoai.native_runner import start_native
start_native(
plano_config_file,
env,
with_tracing=with_tracing,
spawn_watchdog=True,
)
return True
def _watchdog_main(plano_config_file: str) -> None:
"""Main loop running inside the watchdog daemon process."""
_log(f"Watchdog started (PID {os.getpid()})")
while True:
time.sleep(WATCHDOG_POLL_INTERVAL_SECONDS)
secs = _seconds_until_expiry()
if secs is None:
_log("Cannot read token expiry — will retry next cycle")
continue
if secs > WATCHDOG_REFRESH_LEAD_SECONDS:
continue # Token still healthy
_log(f"Token expires in {secs:.0f}s — refreshing and restarting services")
success = _do_restart(plano_config_file)
if not success:
_log(
"Restart failed — exiting watchdog. "
"Services will continue until the token expires, "
"then requests will fail. Run 'planoai chatgpt login' to fix."
)
# Either _do_restart spawned a new watchdog, or it failed.
# Either way, this watchdog's job is done.
return
def spawn_watchdog(plano_config_file: str) -> int:
"""
Spawn a background watchdog daemon to monitor ChatGPT token expiry.
Called from start_native() after services are healthy. Returns the watchdog
daemon PID, or 0 if no watchdog was spawned (no ChatGPT providers, or
recursive spawn was prevented by _NO_WATCHDOG_ENV_VAR).
"""
# Prevent recursive spawning (watchdog calls start_native which calls us)
if os.environ.get(_NO_WATCHDOG_ENV_VAR):
return 0
# Only spawn if the config has ChatGPT providers
try:
import yaml
with open(plano_config_file) as f:
config = yaml.safe_load(f)
providers = config.get("model_providers") or config.get("llm_providers") or []
has_chatgpt = any(
str(p.get("model", "")).startswith("chatgpt/") for p in providers
)
if not has_chatgpt:
return 0
except Exception:
return 0
os.makedirs(PLANO_RUN_DIR, exist_ok=True)
log_fd = os.open(
PLANO_WATCHDOG_LOG_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644
)
# Double-fork to daemonize (mirrors _daemon_exec in native_runner.py)
pid = os.fork()
if pid > 0:
# Parent: close log fd, wait for first child, read back grandchild PID
os.close(log_fd)
os.waitpid(pid, 0)
grandchild_pid_path = os.path.join(PLANO_RUN_DIR, f".daemon_pid_{pid}")
deadline = time.time() + 5
while time.time() < deadline:
if os.path.exists(grandchild_pid_path):
with open(grandchild_pid_path) as f:
grandchild_pid = int(f.read().strip())
os.unlink(grandchild_pid_path)
return grandchild_pid
time.sleep(0.05)
os.close(log_fd) if False else None # already closed above
return 0 # Timed out — watchdog did not start
# First child: create new session, fork again
os.setsid()
grandchild_pid = os.fork()
if grandchild_pid > 0:
# Intermediate child: write grandchild PID and exit
pid_path = os.path.join(PLANO_RUN_DIR, f".daemon_pid_{os.getpid()}")
with open(pid_path, "w") as f:
f.write(str(grandchild_pid))
os._exit(0)
# Grandchild: the actual daemon
os.dup2(log_fd, 1) # stdout -> watchdog log
os.dup2(log_fd, 2) # stderr -> watchdog log
os.close(log_fd)
devnull = os.open(os.devnull, os.O_RDONLY)
os.dup2(devnull, 0)
os.close(devnull)
# Set sentinel so any start_native() we call doesn't spawn another watchdog
os.environ[_NO_WATCHDOG_ENV_VAR] = "1"
try:
_watchdog_main(plano_config_file)
except Exception as exc:
_log(f"Watchdog crashed: {exc}")
finally:
os._exit(0)