"""Phase 4 daemon concurrency primitives (DAEMON-04, DAEMON-05).

Persistent-fd flock wrapper. Hold one instance for process lifetime.
fcntl.flock (NOT lockf) -- fd-close does not release (see apenwarr 2010, Pitfall 2).

Constitutional guard:
- C1 HUMAN-FIRST: ProcessLock.try_acquire_exclusive is non-blocking; daemon
  yields immediately when any shared lockholder exists.
- C-USER-CONSENT (formerly C2 per D7-16): the user_initiated_sleep
  branch of _dispatch_socket_request only sets pending flags after receiving
  an explicit consent payload from the wrapper; the FSM transition itself is
  performed by _tick_body, never by the dispatcher (C-DISPATCHER-FSM-ISOLATION).
- C-DISPATCHER-FSM-ISOLATION (Phase 7 structural; supersedes the bare `C2`
  inline-comment shorthand previously used at the FSM-yield call sites): the
  socket dispatcher MUST NOT transition the FSM directly; it only sets pending
  flags consumed by _tick_body under the FSM lock. New socket_server
  inherits this invariant.
- T-04-06 mitigation: flock is bound to process + open-file-description,
  so closing an unrelated fd (e.g. /etc/passwd) does NOT release our lock.
- T-04-02 mitigation: cleanup_stale_socket + asyncio cleanup_socket kwarg
  survive SIGKILL-orphaned sockets.
- T-04-07 mitigation: lock + socket created with mode 0o600 so cross-user
  access requires OS privilege escalation (out of scope).

This module has NO LLM code and NO paid-API env var references.
"""
from __future__ import annotations

import asyncio
import errno
import fcntl
import json
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Awaitable, Callable

LOCK_PATH: Path = Path.home() / ".iai-mcp" / ".lock"
SOCKET_PATH: Path = Path.home() / ".iai-mcp" / ".daemon.sock"


class ProcessLock:
    """Persistent-fd flock wrapper.

    Hold one instance per process for the entire process lifetime.
    fcntl.flock (BSD) NOT lockf (POSIX) -- closing an unrelated fd does NOT
    release our lock (see apenwarr 2010, Pitfall 2).

    Semantics:
    - acquire_shared():           blocking LOCK_SH (MCP pattern)
    - try_acquire_exclusive():    LOCK_EX | LOCK_NB (daemon heavy-op pattern)
    - holds_exclusive_nb():       cooperative-yield probe
    - release():                  LOCK_UN (release without closing fd)
    - close():                    os.close() the fd (shutdown only)
    """

    def __init__(self, path: Path = LOCK_PATH) -> None:
        path.parent.mkdir(parents=True, exist_ok=True)
        # O_CREAT so lock file is created if missing; mode 0o600 keeps it user-only.
        self._fd: int | None = os.open(path, os.O_RDWR | os.O_CREAT, 0o600)
        # Ensure mode is actually 0o600 even if umask altered it on create.
        try:
            os.chmod(path, 0o600)
        except OSError:
            pass
        self._path = path

    def acquire_shared(self) -> None:
        """Blocking LOCK_SH. MCP sessions call this at session start."""
        if self._fd is None:
            raise RuntimeError("ProcessLock closed; cannot acquire")
        fcntl.flock(self._fd, fcntl.LOCK_SH)

    def try_acquire_exclusive(self) -> bool:
        """Non-blocking LOCK_EX | LOCK_NB.

        Returns True if acquired, False if any shared holder blocks us.
        Daemon calls this before heavy ops; False -> yield to MCP.
        """
        if self._fd is None:
            return False
        try:
            fcntl.flock(self._fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
            return True
        except OSError as exc:
            if exc.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
                return False
            raise

    def holds_exclusive_nb(self) -> bool:
        """D-06 cooperative-yield probe.

        Non-blocking check: do we still hold the exclusive lock?

        Returns True if our fd has the exclusive lock. Returns False if
        another process (e.g., MCP) acquired a shared lock while we were
        working between REM cycles.

        Implementation: fcntl.flock with LOCK_EX | LOCK_NB on our existing fd.
        On Linux/macOS, re-acquiring an already-held lock is a no-op success.
        On contention (shared lock held by another process), raises BlockingIOError
        which we catch and translate to False. EWOULDBLOCK/EAGAIN may surface as
        OSError on some platforms -- caught the same way.
        """
        if self._fd is None:
            return False
        try:
            fcntl.flock(self._fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
            return True
        except BlockingIOError:
            return False
        except OSError as exc:
            if exc.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
                return False
            raise

    def release(self) -> None:
        """LOCK_UN: release lock but keep fd open for later reacquisition."""
        if self._fd is None:
            return
        fcntl.flock(self._fd, fcntl.LOCK_UN)

    def close(self) -> None:
        """Close fd. Only call at process shutdown -- closing releases the lock."""
        if self._fd is not None:
            try:
                os.close(self._fd)
            finally:
                self._fd = None


def cleanup_stale_socket(path: Path = SOCKET_PATH) -> None:
    """Remove a stale socket file left over from SIGKILL-orphaned daemon.

    Pitfall 10 mitigation: the in-process case is handled either by the
    3.13+ kwarg (see serve_control_socket) or by the 3.12 finally-block
    emulation, but a prior daemon killed with SIGKILL never got to run its
    cleanup. Call this BEFORE the server binds.
    """
    try:
        path.unlink()
    except FileNotFoundError:
        pass
    except OSError:
        # Path may be a non-socket file -- still try to unlink. If even that
        # fails (e.g. permission), let asyncio surface the EADDRINUSE.
        try:
            path.unlink()
        except OSError:
            pass


def _validate_socket_message(req: dict) -> tuple[bool, str | None]:
    """Per-type schema validation (ASVS V5).

    Returns (ok, error_message). `req` must already be known to be a dict.
    """
    req_type = req.get("type")
    if not isinstance(req_type, str):
        return False, "type must be a string"

    if req_type == "status":
        # No required fields.
        return True, None

    if req_type == "user_initiated_sleep":
        reason = req.get("reason")
        ts = req.get("ts")
        if not isinstance(reason, str):
            return False, "reason must be a string"
        if not isinstance(ts, str):
            return False, "ts must be a string"
        return True, None

    if req_type in ("force_wake", "force_rem"):
        ts = req.get("ts")
        if not isinstance(ts, str):
            return False, "ts must be a string"
        return True, None

    if req_type in ("pause", "resume"):
        # pause may optionally carry `seconds`; we don't persist it as a timer
        # (the flag is binary) but we DO validate the type if supplied.
        if "seconds" in req:
            seconds = req.get("seconds")
            if not isinstance(seconds, int) or isinstance(seconds, bool):
                return False, "seconds must be an int"
        return True, None

    # TOK-14 / D5-05: 7th message type `session_open`.
    # Both session_id and ts are OPTIONAL; when supplied, they must be strings.
    # Absence is tolerated so the TS wrapper can emit a bare ping on MCP boot
    # without stalling on id/ts bookkeeping.
    if req_type == "session_open":
        if "session_id" in req and not isinstance(req["session_id"], str):
            return False, "session_id must be a string"
        if "ts" in req and not isinstance(req["ts"], str):
            return False, "ts must be a string"
        return True, None

    # Unknown types are not rejected at validation time; the dispatcher
    # returns a structured unknown_message_type response so the caller sees
    # a different reason code from "invalid_message".
    return True, None


async def _dispatch_socket_request(
    req: dict,
    store: Any,
    lock: ProcessLock,
    state: dict,
) -> dict:
    """Default dispatcher for NDJSON socket requests.

    Handles seven message types; mutates `state` in-place and persists via
    `save_state` when the message changes scheduler control flags. The
    dispatcher thread NEVER transitions the FSM directly
    (C-DISPATCHER-FSM-ISOLATION; renamed from bare `C2` per D7-16) --
    it only sets pending flags that `_tick_body` reads under the FSM lock.

    Handled types:
    - status                  -> state snapshot including version
    - user_initiated_sleep    -> set user_sleep_request pending flag
    - force_wake              -> set force_wake_request pending flag
    - force_rem               -> set force_rem_request pending flag
    - pause                   -> scheduler_paused=True
    - resume                  -> scheduler_paused=False
    - session_open            -> set first_turn_pending + hippea_cascade_request
                                 (Plan 05-04 TOK-14 / D5-05)
    - any other               -> {"ok": False, "reason": "unknown_message_type"}
    """
    # Reject non-dict requests (defence-in-depth; caller already json.loaded).
    if not isinstance(req, dict):
        return {
            "ok": False,
            "reason": "invalid_message",
            "error": "request must be a JSON object",
        }

    # Per-type schema validation (ASVS V5).
    ok, err = _validate_socket_message(req)
    if not ok:
        return {
            "ok": False,
            "reason": "invalid_message",
            "error": err or "schema_validation_failed",
        }

    req_type = req.get("type")

    # Lazy imports so test monkeypatches of STATE_PATH (via daemon_state) and
    # __version__ (via iai_mcp) always resolve to the current module state.
    from datetime import datetime, timezone

    from iai_mcp import __version__ as pkg_version
    from iai_mcp.daemon_state import save_state

    # -------------------------------------------------------- status snapshot
    if req_type == "status":
        fsm_state = state.get("fsm_state", "WAKE")
        started_at = state.get("daemon_started_at")
        uptime_sec: float | None = None
        if started_at:
            try:
                start_dt = datetime.fromisoformat(started_at)
                uptime_sec = (datetime.now(timezone.utc) - start_dt).total_seconds()
            except (TypeError, ValueError):
                uptime_sec = None

        # Truncate pending_digest to the top-level counters for socket
        # transport; the full digest can be multi-KB once insights are baked.
        pending_digest = state.get("pending_digest")
        if isinstance(pending_digest, dict):
            truncated_digest = {
                "rem_cycles_completed": pending_digest.get("rem_cycles_completed", 0),
                "episodes_processed": pending_digest.get("episodes_processed", 0),
                "schemas_induced_tier0": pending_digest.get(
                    "schemas_induced_tier0", 0,
                ),
                "claude_call_used": pending_digest.get("claude_call_used", False),
            }
        else:
            truncated_digest = None

        return {
            "ok": True,
            # Backwards-compat key used by tests/test_concurrency.py Test 6.
            "state": fsm_state,
            "uptime_sec": uptime_sec,
            # Plan 04-gap-1 additions:
            "version": pkg_version,
            "fsm_state": fsm_state,
            "last_tick_at": state.get("last_tick_at"),
            "quiet_window": state.get("quiet_window"),
            "pending_digest": truncated_digest,
            "daemon_started_at": started_at,
            "scheduler_paused": bool(state.get("scheduler_paused", False)),
        }

    # -------------------------------------------------- user_initiated_sleep
    if req_type == "user_initiated_sleep":
        current_fsm = state.get("fsm_state", "WAKE")
        if current_fsm in ("SLEEP", "DREAMING", "TRANSITIONING"):
            return {"ok": False, "reason": "already_sleeping"}

        # Clip reason to 500 chars (ASVS V5 output hardening mirror).
        reason = str(req.get("reason", ""))[:500]
        ts = str(req.get("ts", ""))
        state["user_sleep_request"] = {
            "reason": reason,
            "ts": ts,
            "pending": True,
        }
        try:
            save_state(state)
        except Exception as exc:  # noqa: BLE001 -- socket must never crash daemon
            return {"ok": False, "reason": "state_write_failed", "error": str(exc)[:200]}
        # Tell the caller we queued the transition; the scheduler owns the FSM
        # and will move WAKE->TRANSITIONING->SLEEP on the next tick
        # (C-DISPATCHER-FSM-ISOLATION; renamed from bare `C2` per D7-16).
        return {"ok": True, "state": "TRANSITIONING"}

    # ---------------------------------------------------------- force_wake
    if req_type == "force_wake":
        ts = str(req.get("ts", ""))
        state["force_wake_request"] = {"ts": ts, "pending": True}
        try:
            save_state(state)
        except Exception as exc:  # noqa: BLE001
            return {"ok": False, "reason": "state_write_failed", "error": str(exc)[:200]}
        return {"ok": True, "reason": "wake_queued"}

    # ----------------------------------------------------------- force_rem
    if req_type == "force_rem":
        ts = str(req.get("ts", ""))
        state["force_rem_request"] = {"ts": ts, "pending": True}
        try:
            save_state(state)
        except Exception as exc:  # noqa: BLE001
            return {"ok": False, "reason": "state_write_failed", "error": str(exc)[:200]}
        return {"ok": True, "reason": "rem_queued"}

    # --------------------------------------------------------- pause/resume
    if req_type == "pause":
        state["scheduler_paused"] = True
        try:
            save_state(state)
        except Exception as exc:  # noqa: BLE001
            return {"ok": False, "reason": "state_write_failed", "error": str(exc)[:200]}
        return {"ok": True, "paused": True}

    if req_type == "resume":
        state["scheduler_paused"] = False
        try:
            save_state(state)
        except Exception as exc:  # noqa: BLE001
            return {"ok": False, "reason": "state_write_failed", "error": str(exc)[:200]}
        return {"ok": True, "paused": False}

    # ---------------------------------------------------------- session_open
    # TOK-14 / D5-05: 7th message type. Sets two flags:
    #   - first_turn_pending[session_id] = True  -> consumed by core's
    #     _first_turn_recall_hook exactly once per session.
    #   - hippea_cascade_request {pending=True, session_id, ts} -> polled by
    #     daemon._hippea_cascade_loop which pre-warms the LRU with records
    #     from the top-K salient communities (Van de Cruys HIPPEA operational
    #     form).
    # Both flags are idempotent under a re-emit: set_overwrite is intentional
    # so a client that retries session_open gets a fresh cascade.
    if req_type == "session_open":
        # Clip session_id to 128 chars (ASVS V5 output hardening — matches
        # user_initiated_sleep.reason clip at 500).
        session_id = str(req.get("session_id", ""))[:128]
        ts = str(req.get("ts", ""))
        state["last_session_open"] = {"session_id": session_id, "ts": ts}
        # first-turn hook flag. Co-exists with existing dict form
        # written by daemon_state.mark_session_opened.
        first_turn = state.setdefault("first_turn_pending", {})
        now_iso = datetime.now(timezone.utc).isoformat()
        if isinstance(first_turn, dict):
            first_turn[session_id] = now_iso
        else:
            # Legacy scalar-bool state -> upgrade in place to the dict form.
            state["first_turn_pending"] = {session_id: now_iso}
        # cascade flag.
        state["hippea_cascade_request"] = {
            "session_id": session_id,
            "ts": ts,
            "pending": True,
        }
        try:
            save_state(state)
        except Exception as exc:  # noqa: BLE001
            return {"ok": False, "reason": "state_write_failed", "error": str(exc)[:200]}
        return {"ok": True, "reason": "session_open_queued"}

    # ------------------------------------------------------------ unknown
    return {
        "ok": False,
        "reason": "unknown_message_type",
        "type": req_type,
    }


async def serve_control_socket(
    store: Any,
    lock: ProcessLock,
    state: dict,
    shutdown: asyncio.Event,
    *,
    dispatcher: Callable[[dict], Awaitable[dict]] | None = None,
    socket_path: Path = SOCKET_PATH,
) -> None:
    """Unix socket NDJSON server at ~/.iai-mcp/.daemon.sock.

    Protocol: each line from client is a JSON request; each response is one
    JSON line back. The cleanup_socket kwarg (Python 3.13+) auto-removes the
    socket file on server shutdown; on 3.12 we emulate in the finally-block.
    Stale-socket pre-cleanup protects against SIGKILL-orphaned files.

    Permissions: chmod 0o600 immediately after bind so cross-user access
    requires privilege escalation (T-04-04 accepted risk).

    When dispatcher is provided it receives only the parsed request dict and
    must return a dict. When None, the default _dispatch_socket_request is used.
    """
    cleanup_stale_socket(socket_path)
    # Ensure parent dir exists (Path.home() / .iai-mcp could be first-run).
    socket_path.parent.mkdir(parents=True, exist_ok=True)

    # Python 3.13 added a `cleanup_socket` kwarg to the event-loop unix server
    # that auto-removes the socket file on shutdown. On 3.12 we emulate the
    # same behaviour by unlinking in the finally-block below. See:
    # https://docs.python.org/3.13/library/asyncio-stream.html
    _supports_cleanup_socket = False
    try:
        import inspect as _inspect
        import asyncio as _asyncio_mod
        _loop_sig = _inspect.signature(
            _asyncio_mod.get_event_loop_policy().new_event_loop().create_unix_server
        )
        _supports_cleanup_socket = "cleanup_socket" in _loop_sig.parameters
    except Exception:
        _supports_cleanup_socket = False

    async def handle(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
        try:
            line = await reader.readline()
            if not line:
                return
            try:
                req = json.loads(line)
            except (TypeError, ValueError) as exc:
                writer.write((json.dumps({"error": f"invalid_json: {exc}"}) + "\n").encode("utf-8"))
                await writer.drain()
                return
            try:
                if dispatcher is not None:
                    resp = await dispatcher(req)
                else:
                    resp = await _dispatch_socket_request(req, store, lock, state)
            except Exception as exc:  # noqa: BLE001 -- socket must never crash daemon
                resp = {"error": str(exc)}
            writer.write((json.dumps(resp) + "\n").encode("utf-8"))
            await writer.drain()
        finally:
            try:
                writer.close()
                await writer.wait_closed()
            except Exception:
                pass

    # Build server kwargs. The native 3.13+ behaviour is opted in via
    # `cleanup_socket=True`; on 3.12 the finally-block emulates the same unlink
    # so a subsequent daemon boot cannot hit EADDRINUSE.
    _server_kwargs = {"cleanup_socket": True} if _supports_cleanup_socket else {}
    server = await asyncio.start_unix_server(
        handle, path=str(socket_path), **_server_kwargs,
    )
    # chmod 0o600 immediately after bind (T-04-07 mitigation).
    try:
        os.chmod(str(socket_path), 0o600)
    except OSError:
        pass

    try:
        async with server:
            await shutdown.wait()
    finally:
        # Python 3.12 cleanup-socket emulation: remove the socket file on
        # shutdown so the next daemon boot doesn't hit EADDRINUSE. 3.13+ does
        # this natively inside the server.__aexit__.
        if not _supports_cleanup_socket:
            try:
                socket_path.unlink()
            except FileNotFoundError:
                pass
            except OSError:
                pass