mirror of
https://github.com/katanemo/plano.git
synced 2026-04-30 11:26:27 +02:00
211 lines
6.8 KiB
Python
211 lines
6.8 KiB
Python
"""
|
|
Background daemon that monitors ChatGPT OAuth token expiry and restarts
Plano processes with a fresh token before the old one expires.

The watchdog is spawned by start_native() when ChatGPT providers are present.
It runs as a fully daemonized process (double-fork) and exits after triggering
a restart (a fresh watchdog is spawned by the new start_native() call).
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import time
|
|
from typing import Optional
|
|
|
|
from planoai.consts import (
|
|
NATIVE_PID_FILE,
|
|
PLANO_RUN_DIR,
|
|
PLANO_WATCHDOG_LOG_FILE,
|
|
PLANO_WATCHDOG_STATE_FILE,
|
|
)
|
|
|
|
# Lead time: once the token is within this many seconds of expiring, the
# watchdog refreshes it and restarts the services.
WATCHDOG_REFRESH_LEAD_SECONDS: int = 300  # 5 minutes

# Pause, in seconds, between two consecutive expiry checks.
WATCHDOG_POLL_INTERVAL_SECONDS: int = 30

# Sentinel environment variable: while it is set, spawn_watchdog() returns
# without spawning anything, which prevents recursive watchdog creation.
_NO_WATCHDOG_ENV_VAR: str = "PLANO_NO_WATCHDOG"
|
|
|
|
|
|
def _log(msg: str) -> None:
    """Write *msg* to stdout with a local timestamp and a ``watchdog:`` tag.

    The line is flushed immediately so it is not held back in a buffer.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{stamp}] watchdog: {msg}", flush=True)
|
|
|
|
|
|
def _seconds_until_expiry() -> Optional[float]:
    """Return how many seconds remain before the ChatGPT access token expires.

    The value is ``expires_at`` (from the stored auth data) minus the current
    time, so it is negative once the token has already expired.

    Returns:
        The remaining lifetime in seconds, or ``None`` when it cannot be
        determined: no auth data, no ``expires_at`` entry, or any error while
        loading or converting it.
    """
    try:
        # Imported lazily, inside the try, so an import failure also yields None.
        from planoai.chatgpt_auth import load_auth

        credentials = load_auth()
        expiry = credentials.get("expires_at") if credentials else None
        return (float(expiry) - time.time()) if expiry else None
    except Exception:
        # Best effort: the caller treats None as "unknown, retry later".
        return None
|
|
|
|
|
|
def _stop_services(skip_pids: set) -> None:
    """Stop envoy and brightstaff without killing the watchdog (self).

    Thin wrapper around ``planoai.native_runner.stop_native``.

    Args:
        skip_pids: PIDs forwarded unchanged as ``stop_native(skip_pids=...)``;
            the restart path passes the watchdog's own PID here.
    """
    # Imported at call time rather than at module import.
    import planoai.native_runner as native_runner

    native_runner.stop_native(skip_pids=skip_pids)
|
|
|
|
|
|
def _do_restart(plano_config_file: str) -> bool:
    """Refresh the ChatGPT token, stop Envoy+brightstaff, restart with the new token.

    Steps, in order:
      1. Load the saved watchdog state (environment dict + ``with_tracing``).
      2. Obtain a fresh access token.
      3. Stop the running services, skipping this process.
      4. Clear the no-watchdog sentinel from this process's environment.
      5. Call ``start_native()`` with the updated environment.

    Nothing is stopped unless steps 1 and 2 both succeed, so a failure there
    leaves the currently running services untouched.

    Args:
        plano_config_file: Path to the Plano config, passed to ``start_native()``.

    Returns:
        True once ``start_native()`` has returned; False if the state file is
        missing, unreadable or malformed, or if the token refresh failed.

    Raises:
        Whatever ``stop_native()`` / ``start_native()`` raise; those errors are
        not handled here.
    """
    # 1. Load saved state (env dict + metadata).  Open directly instead of
    #    checking for existence first, so there is no window between the check
    #    and the read, and so a corrupt file becomes a clean False, not a crash.
    try:
        with open(PLANO_WATCHDOG_STATE_FILE) as f:
            state = json.load(f)
    except FileNotFoundError:
        _log("Watchdog state file missing — cannot restart services")
        return False
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and text decoding errors.
        _log(f"Watchdog state file unreadable: {exc} — cannot restart services")
        return False

    env = state.get("env") if isinstance(state, dict) else None
    if not isinstance(env, dict):
        _log("Watchdog state file has no 'env' mapping — cannot restart services")
        return False
    with_tracing = state.get("with_tracing", False)

    # 2. Refresh the token
    try:
        from planoai.chatgpt_auth import get_access_token

        access_token, account_id = get_access_token()
    except Exception as exc:
        _log(
            f"Token refresh failed: {exc} — "
            "run 'planoai chatgpt login' to re-authenticate"
        )
        return False

    env["CHATGPT_ACCESS_TOKEN"] = access_token
    if account_id:
        env["CHATGPT_ACCOUNT_ID"] = account_id

    # 3. Stop envoy + brightstaff (skip self so we don't self-terminate)
    _stop_services(skip_pids={os.getpid()})

    # 4. Unset the sentinel so start_native() spawns a fresh watchdog
    os.environ.pop(_NO_WATCHDOG_ENV_VAR, None)

    # 5. Restart with the fresh token; this also spawns the next watchdog
    from planoai.native_runner import start_native

    start_native(
        plano_config_file,
        env,
        with_tracing=with_tracing,
        spawn_watchdog=True,
    )
    return True
|
|
|
|
|
|
def _watchdog_main(plano_config_file: str) -> None:
    """Run the watchdog's polling loop inside the daemon process.

    Sleeps ``WATCHDOG_POLL_INTERVAL_SECONDS`` between checks.  As soon as the
    remaining token lifetime is no longer above ``WATCHDOG_REFRESH_LEAD_SECONDS``,
    it performs exactly one restart attempt and returns, whether or not that
    attempt succeeded.

    Args:
        plano_config_file: Path to the Plano config, forwarded to ``_do_restart``.
    """
    _log(f"Watchdog started (PID {os.getpid()})")

    # Phase 1: poll until the token is close enough to expiry.
    while True:
        time.sleep(WATCHDOG_POLL_INTERVAL_SECONDS)

        remaining = _seconds_until_expiry()
        if remaining is None:
            _log("Cannot read token expiry — will retry next cycle")
        elif not remaining > WATCHDOG_REFRESH_LEAD_SECONDS:
            break  # Inside the refresh window
        # Otherwise the token is still healthy; keep polling.

    # Phase 2: one restart attempt.  On success the new start_native() call
    # has spawned a replacement watchdog; on failure there is nothing more to
    # do.  Either way this watchdog's job is done.
    _log(f"Token expires in {remaining:.0f}s — refreshing and restarting services")
    if not _do_restart(plano_config_file):
        _log(
            "Restart failed — exiting watchdog. "
            "Services will continue until the token expires, "
            "then requests will fail. Run 'planoai chatgpt login' to fix."
        )
|
|
|
|
|
|
def _config_has_chatgpt_provider(plano_config_file: str) -> bool:
    """Return True if the config lists a provider whose model starts with ``chatgpt/``.

    Best effort: any failure (PyYAML missing, unreadable file, unexpected
    structure) is treated as "no ChatGPT provider" and yields False.
    """
    try:
        import yaml

        with open(plano_config_file) as f:
            config = yaml.safe_load(f)
        providers = config.get("model_providers") or config.get("llm_providers") or []
        return any(str(p.get("model", "")).startswith("chatgpt/") for p in providers)
    except Exception:
        return False


def spawn_watchdog(plano_config_file: str) -> int:
    """Spawn a background watchdog daemon to monitor ChatGPT token expiry.

    Called from start_native() after services are healthy.  The daemon is
    created with a double fork: the intermediate child starts a new session,
    forks the real daemon, records the daemon's PID in
    ``PLANO_RUN_DIR/.daemon_pid_<intermediate pid>`` and exits; the parent
    reads that file back and deletes it.

    Uses ``os.fork()`` and ``os.setsid()``, so it only works on platforms
    that provide them.

    Args:
        plano_config_file: Path to the Plano config.  It is inspected for
            ChatGPT providers and handed to the watchdog loop.

    Returns:
        The watchdog daemon PID, or 0 if no watchdog was spawned: recursive
        spawn prevented by ``_NO_WATCHDOG_ENV_VAR``, no ChatGPT providers (or
        the config could not be read), or the daemon PID did not show up
        within 5 seconds.

    Raises:
        OSError: In the calling process only, if the run directory or log file
            cannot be created or the first fork fails.
    """
    # Prevent recursive spawning (watchdog calls start_native which calls us)
    if os.environ.get(_NO_WATCHDOG_ENV_VAR):
        return 0

    # Only spawn if the config has ChatGPT providers
    if not _config_has_chatgpt_provider(plano_config_file):
        return 0

    os.makedirs(PLANO_RUN_DIR, exist_ok=True)
    # NOTE: O_TRUNC means every spawn starts the watchdog log from scratch.
    log_fd = os.open(
        PLANO_WATCHDOG_LOG_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644
    )

    # Double-fork to daemonize (mirrors _daemon_exec in native_runner.py)
    try:
        pid = os.fork()
    except OSError:
        os.close(log_fd)  # Don't leak the descriptor when the fork fails
        raise

    if pid > 0:
        # Parent: close log fd, wait for first child, read back grandchild PID
        os.close(log_fd)
        os.waitpid(pid, 0)
        grandchild_pid_path = os.path.join(PLANO_RUN_DIR, f".daemon_pid_{pid}")
        deadline = time.time() + 5
        while time.time() < deadline:
            if os.path.exists(grandchild_pid_path):
                with open(grandchild_pid_path) as f:
                    grandchild_pid = int(f.read().strip())
                os.unlink(grandchild_pid_path)
                return grandchild_pid
            time.sleep(0.05)
        return 0  # Timed out — watchdog did not start

    # From here on we are in a forked child.  A child must never return or
    # raise back into the caller's code (that would leave a second copy of the
    # calling program running), so every path below ends in os._exit().
    exit_code = 1
    try:
        # First child: create new session, fork again
        os.setsid()
        grandchild_pid = os.fork()
        if grandchild_pid > 0:
            # Intermediate child: write grandchild PID and exit
            pid_path = os.path.join(PLANO_RUN_DIR, f".daemon_pid_{os.getpid()}")
            with open(pid_path, "w") as f:
                f.write(str(grandchild_pid))
            os._exit(0)

        # Grandchild: the actual daemon
        os.dup2(log_fd, 1)  # stdout -> watchdog log
        os.dup2(log_fd, 2)  # stderr -> watchdog log
        os.close(log_fd)
        devnull = os.open(os.devnull, os.O_RDONLY)
        os.dup2(devnull, 0)  # stdin <- /dev/null
        os.close(devnull)

        # Set sentinel so any start_native() we call doesn't spawn another watchdog
        os.environ[_NO_WATCHDOG_ENV_VAR] = "1"

        try:
            _watchdog_main(plano_config_file)
        except Exception as exc:
            _log(f"Watchdog crashed: {exc}")
        exit_code = 0
    except Exception as exc:
        # Daemon setup itself failed (setsid / fork / PID file / dup2).
        _log(f"Watchdog failed to start: {exc}")
    finally:
        os._exit(exit_code)
|