"""Hierarchical community detection (D-05 bootstrap + stable UUIDs + CONN-01/04).

Policy:
- N < SMALL_N_FLAT (200): single flat community. Rich-club coefficient is too noisy
  below this per van den Heuvel & Sporns 2011; Leiden output is unstable too.
- SMALL_N_FLAT <= N < MID_N_LEIDEN (500): run Leiden; accept only if Q >= 0.2
  (MODULARITY_FLOOR), else fall back to flat. Protects against Leiden producing
  visible but unjustified communities in sparse graphs.
- N >= MID_N_LEIDEN: always run Leiden; accept result regardless of Q
  (graph is big enough that any modular structure is meaningful).

Stable UUIDs:
- Every community gets a persistent UUID at creation.
- On re-run, each new community's centroid is matched against prior centroids;
  the highest cosine >= UUID_ROTATE_COSINE (0.7) reuses the prior UUID.
  If no prior centroid passes the 0.7 bar, a fresh UUID is allocated.
- This prevents ID churn on re-runs where Leiden re-orders labels but the
  cluster membership is essentially the same.

CONN-01 three-level parcellation (Phase 1 approximation):
- Level 1: top_communities -- top 7 (Yeo-like) by member count.
- Level 2: mid_regions -- community UUID -> member node UUIDs
           (Schaefer-scale 200-400 sub-parcellation is a Phase-2 refinement;
            for now we expose the community -> members mapping).
- Level 3: node_to_community -- every leaf record's community assignment.

CONN-04 refresh threshold:
- needs_refresh(prior, current_Q) returns True iff |prior.Q - current_Q| > 0.05.
  The pipeline or session-start assembler decides when to re-run detect_communities
  based on this signal.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from uuid import UUID, uuid4

import numpy as np

from iai_mcp.graph import _HAS_IGRAPH, IGRAPH_THRESHOLD, MemoryGraph

# D-05 bootstrap thresholds (node counts) and the modularity floor:
# - below SMALL_N_FLAT the graph is kept as a single flat community;
# - from SMALL_N_FLAT up to (not including) MID_N_LEIDEN a Leiden result is
#   accepted only when its modularity Q is at least MODULARITY_FLOOR;
# - at MID_N_LEIDEN and above the Leiden result is accepted regardless of Q.
SMALL_N_FLAT = 200
MID_N_LEIDEN = 500
MODULARITY_FLOOR = 0.2

# CONN-04 refresh trigger: needs_refresh fires when |delta Q| strictly exceeds this.
REFRESH_DELTA = 0.05

# Stable-UUID cosine floor: a prior community UUID is reused only when the new
# centroid's cosine similarity to the prior centroid is >= this value.
UUID_ROTATE_COSINE = 0.7

# CONN-01 level-1 cap (Yeo-like 7 networks): max entries in top_communities.
MAX_TOP_COMMUNITIES = 7


@dataclass
class CommunityAssignment:
    """Output of detect_communities -- consumed by pipeline.pipeline_recall.

    Every field has an empty/zero default, so ``CommunityAssignment(backend="flat")``
    is the "nothing detected" value returned for an empty graph.

    - node_to_community: leaf UUID -> community UUID
    - community_centroids: community UUID -> unit-normalised mean of member
      embeddings (may be [] for a flat assignment whose nodes have no embeddings)
    - modularity: Leiden Q (0.0 for flat)
    - backend: "flat" | "leiden-networkx" | "leiden-igraph"
    - top_communities: up to MAX_TOP_COMMUNITIES by member count (CONN-01 L1)
    - mid_regions: community UUID -> list of member leaf UUIDs (CONN-01 L2)
    """

    # CONN-01 level 3: community membership of every node.
    node_to_community: dict[UUID, UUID] = field(default_factory=dict)
    # Centroids are what a later run compares against to reuse community UUIDs.
    community_centroids: dict[UUID, list[float]] = field(default_factory=dict)
    # Compared against a fresh Q by needs_refresh (CONN-04).
    modularity: float = 0.0
    # Label of the code path that produced this assignment.
    backend: str = "flat"
    # CONN-01 level 1: largest communities first.
    top_communities: list[UUID] = field(default_factory=list)
    # CONN-01 level 2: inverse view of node_to_community.
    mid_regions: dict[UUID, list[UUID]] = field(default_factory=dict)


# ---------------------------------------------------------------- math helpers


def _cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two vectors, or 0.0 when it is undefined.

    The similarity is undefined -- and reported as 0.0, i.e. "no match" -- when
    either vector is empty or all-zero, or when the two vectors do not have the
    same shape. The shape guard matters after a re-embed migration
    (e.g. 384d -> 1024d): a centroid stored at the old dimension must simply
    fail to match a new one instead of making ``np.dot`` raise ``ValueError``.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        The cosine similarity as a Python float (computed in float32), or 0.0
        for the undefined cases described above.
    """
    av = np.asarray(a, dtype=np.float32)
    bv = np.asarray(b, dtype=np.float32)
    # Vectors of different dimensionality are never similar; bail out before
    # np.dot can raise on misaligned shapes.
    if av.shape != bv.shape:
        return 0.0
    na = float(np.linalg.norm(av))
    nb = float(np.linalg.norm(bv))
    # A zero norm would divide by zero; an empty vector also lands here.
    if na == 0 or nb == 0:
        return 0.0
    return float(np.dot(av, bv) / (na * nb))


def _compute_centroid(embeddings: list[list[float]]) -> list[float]:
    """Return the unit-length mean of ``embeddings`` as a plain list.

    An empty input yields ``[]``. The mean is taken row-wise in float32; it is
    scaled to unit norm unless the mean itself is the zero vector, in which
    case the zero vector is returned unchanged.
    """
    if embeddings:
        mean_vec = np.mean(np.asarray(embeddings, dtype=np.float32), axis=0)
        length = float(np.linalg.norm(mean_vec))
        # Only rescale when there is a direction to preserve.
        unit = mean_vec / length if length > 0 else mean_vec
        return unit.tolist()
    return []


def _map_to_stable_uuids(
    raw_partition: dict[UUID, int],
    graph: MemoryGraph,
    prior: CommunityAssignment | None,
) -> tuple[dict[UUID, UUID], dict[UUID, list[float]]]:
    """Assign UUIDs to raw integer community labels, reusing prior UUIDs.

    A raw group reuses a prior community UUID when its centroid's cosine
    similarity to that prior centroid is >= UUID_ROTATE_COSINE. Matching is
    greedy and one-to-one: groups are visited in ascending raw-label order,
    each one claims its best still-unclaimed prior UUID, and each prior UUID
    is claimed by at most one new community. Every group that claims nothing
    receives a fresh ``uuid4()``.

    A group in which no member has an embedding cannot be matched; it gets a
    fresh UUID and an empty centroid (``[]``).

    Args:
        raw_partition: node UUID -> raw integer community label.
        graph: source of embeddings, read through ``graph.get_embedding``.
        prior: previous assignment whose centroids are candidates for UUID
            reuse, or None when there is no previous run.

    Returns:
        ``(node_to_community, community_centroids)``: node UUID -> community
        UUID, and community UUID -> centroid.
    """
    # Group nodes by raw integer label.
    groups: dict[int, list[UUID]] = {}
    for node, grp in raw_partition.items():
        groups.setdefault(grp, []).append(node)

    # Compute new centroids per group. Each embedding is fetched once; members
    # without one (e.g. sentinel UUIDs like PROFILE_SENTINEL) are zero-padded
    # to the dimension of the group's first real embedding rather than a
    # hardcoded 384d, so the centroid input stays homogeneous after a
    # 384d -> 1024d re-embed migration.
    new_centroids: dict[int, list[float]] = {}
    for grp, nodes in groups.items():
        fetched = [graph.get_embedding(n) for n in nodes]
        dim = next((len(e) for e in fetched if e), 0)
        if not dim:
            # No member carries an embedding: nothing to average or match.
            continue
        new_centroids[grp] = _compute_centroid(
            [e or [0.0] * dim for e in fetched]
        )

    # Greedy one-to-one assignment: for each new group, pick the best unused
    # prior UUID with cosine >= UUID_ROTATE_COSINE.
    uuid_for_group: dict[int, UUID] = {}
    used_prior: set[UUID] = set()
    if prior is not None:
        # Stable ordering: by group id ascending so tie-breaks are deterministic.
        for grp in sorted(new_centroids):
            cent = new_centroids[grp]
            best_prior: UUID | None = None
            best_sim: float = -1.0
            for prior_uuid, prior_cent in prior.community_centroids.items():
                if prior_uuid in used_prior:
                    continue
                s = _cosine(cent, prior_cent)
                if s > best_sim:
                    best_sim = s
                    best_prior = prior_uuid
            if best_prior is not None and best_sim >= UUID_ROTATE_COSINE:
                uuid_for_group[grp] = best_prior
                used_prior.add(best_prior)

    # Build final maps, allocating a fresh UUID for every group that did not
    # claim a prior one.
    node_to_community: dict[UUID, UUID] = {}
    community_centroids: dict[UUID, list[float]] = {}
    for grp, nodes in groups.items():
        u = uuid_for_group.get(grp)
        if u is None:
            u = uuid4()
        # Groups skipped above have no centroid entry; indexing new_centroids
        # directly would raise KeyError, so fall back to an empty centroid.
        community_centroids[u] = new_centroids.get(grp, [])
        for n in nodes:
            node_to_community[n] = u

    return node_to_community, community_centroids


# ------------------------------------------------------------- flat assignment


def _flat_assignment(
    graph: MemoryGraph, prior: CommunityAssignment | None
) -> CommunityAssignment:
    """Build a single flat community covering every node of ``graph``.

    Args:
        graph: graph whose ``_nx`` nodes (UUID strings) all become members and
            whose ``get_embedding`` supplies the vectors for the centroid.
        prior: previous assignment, or None. Its community UUID is reused only
            when it holds exactly one community whose centroid has cosine
            similarity >= UUID_ROTATE_COSINE with the new centroid.

    Returns:
        A ``backend="flat"`` assignment with modularity 0.0. For a graph with
        no nodes this is the empty default assignment. The centroid is ``[]``
        when no node has an embedding.
    """
    # Single pass: parse each node id and fetch its embedding exactly once.
    nodes: list[UUID] = []
    fetched: list[list[float] | None] = []
    for node in graph._nx.nodes():
        u = UUID(node)
        nodes.append(u)
        fetched.append(graph.get_embedding(u))
    if not nodes:
        return CommunityAssignment(backend="flat")

    # Zero-pad any sentinel nodes to the detected store dim so centroid math
    # stays homogeneous post-re-embed (was hardcoded 384d before 1024d support).
    dim = next((len(e) for e in fetched if e), 0)
    if dim:
        centroid = _compute_centroid([e if e else [0.0] * dim for e in fetched])
    else:
        centroid = []

    # Stable UUID across flat runs: reuse prior's single UUID if centroid matches.
    flat_uuid: UUID | None = None
    if prior is not None and len(prior.community_centroids) == 1:
        prior_uuid, prior_cent = next(iter(prior.community_centroids.items()))
        if _cosine(centroid, prior_cent) >= UUID_ROTATE_COSINE:
            flat_uuid = prior_uuid
    if flat_uuid is None:
        flat_uuid = uuid4()

    return CommunityAssignment(
        node_to_community={n: flat_uuid for n in nodes},
        community_centroids={flat_uuid: centroid},
        modularity=0.0,
        backend="flat",
        top_communities=[flat_uuid],
        mid_regions={flat_uuid: nodes},
    )


# ------------------------------------------------------------------ leiden run


def _run_leiden(graph: MemoryGraph) -> tuple[dict[UUID, int], float, str]:
    """Partition ``graph._nx`` with leidenalg through an undirected igraph copy.

    Edge weights come from each edge's "weight" attribute (1.0 when absent).
    The partition optimises modularity with a fixed seed of 42 so repeated
    calls are deterministic.

    Returns:
        ``(labels, q, backend)`` where ``labels`` maps node UUID -> integer
        community label, ``q`` is the partition's modularity, and ``backend``
        is "leiden-igraph" when igraph is available and the node count is at
        least IGRAPH_THRESHOLD, else "leiden-networkx". Both labels describe
        the same leidenalg run; only the reported name differs (per D-04).
    """
    import igraph as ig  # imported lazily so the Leiden dependency stays optional
    import leidenalg

    nx_graph = graph._nx
    node_ids = list(nx_graph.nodes())
    position = {node_id: pos for pos, node_id in enumerate(node_ids)}

    # Translate every edge to index space and collect its weight alongside.
    edge_pairs: list[tuple[int, int]] = []
    edge_weights: list[float] = []
    for src, dst in nx_graph.edges():
        edge_pairs.append((position[src], position[dst]))
        edge_weights.append(float(nx_graph[src][dst].get("weight", 1.0)))

    mirror = ig.Graph(n=len(node_ids), edges=edge_pairs, directed=False)
    weight_attr = None
    if edge_weights:
        # Only an edgeless graph skips the weight attribute.
        mirror.es["weight"] = edge_weights
        weight_attr = "weight"

    partition = leidenalg.find_partition(
        mirror,
        leidenalg.ModularityVertexPartition,
        seed=42,
        weights=weight_attr,
    )
    modularity = float(partition.modularity)
    labels = {
        UUID(node_id): int(label)
        for node_id, label in zip(node_ids, partition.membership)
    }

    # Backend label matches the D-04 split even though both paths use leidenalg.
    large_graph = _HAS_IGRAPH and graph.node_count() >= IGRAPH_THRESHOLD
    backend = "leiden-igraph" if large_graph else "leiden-networkx"
    return labels, modularity, backend


# ------------------------------------------------------------------ public API


def detect_communities(
    graph: MemoryGraph,
    prior: CommunityAssignment | None = None,
) -> CommunityAssignment:
    """D-05 bootstrap + stable UUIDs + CONN-01 three-level parcellation.

    Policy, by node count ``n``:
    - ``n == 0``: empty ``CommunityAssignment(backend="flat")``.
    - ``n < SMALL_N_FLAT``: single flat community.
    - ``SMALL_N_FLAT <= n < MID_N_LEIDEN``: Leiden, kept only when its
      modularity is >= MODULARITY_FLOOR, otherwise flat.
    - ``n >= MID_N_LEIDEN``: Leiden, kept regardless of modularity.

    If the Leiden run raises, the failure is logged and the flat assignment is
    returned instead, so callers never see the exception.

    Args:
        graph: the memory graph to partition.
        prior: previous assignment used to keep community UUIDs stable across
            runs, or None.

    Returns:
        The new assignment, including the level-1 ranking (``top_communities``)
        and the level-2 community -> members map (``mid_regions``).
    """
    import logging  # local import: only needed on the fallback path

    n = graph.node_count()
    if n == 0:
        return CommunityAssignment(backend="flat")
    if n < SMALL_N_FLAT:
        return _flat_assignment(graph, prior)

    log = logging.getLogger(__name__)
    try:
        raw_partition, q, backend = _run_leiden(graph)
    except ImportError as exc:
        # Leiden stack not installed: an expected, quiet degradation.
        log.info("Leiden backend unavailable (%s); using flat assignment", exc)
        return _flat_assignment(graph, prior)
    except Exception:
        # Pathological graph or a genuine bug: still degrade gracefully, but
        # leave a traceback so the failure is not silently masked as "flat".
        log.warning(
            "Leiden community detection failed; falling back to flat assignment",
            exc_info=True,
        )
        return _flat_assignment(graph, prior)

    # Mid-N guard: Leiden output only acceptable if Q >= MODULARITY_FLOOR.
    if n < MID_N_LEIDEN and q < MODULARITY_FLOOR:
        return _flat_assignment(graph, prior)

    node_to_community, community_centroids = _map_to_stable_uuids(
        raw_partition, graph, prior
    )

    # CONN-01 level 2 (mid-regions): community UUID -> member node UUIDs.
    mid_regions: dict[UUID, list[UUID]] = {}
    for node, comm in node_to_community.items():
        mid_regions.setdefault(comm, []).append(node)

    # CONN-01 level 1: largest communities first, capped at MAX_TOP_COMMUNITIES.
    # The sort is stable, so equally sized communities keep first-seen order.
    top_communities = sorted(
        mid_regions, key=lambda comm: len(mid_regions[comm]), reverse=True
    )[:MAX_TOP_COMMUNITIES]

    return CommunityAssignment(
        node_to_community=node_to_community,
        community_centroids=community_centroids,
        modularity=q,
        backend=backend,
        top_communities=top_communities,
        mid_regions=mid_regions,
    )


def needs_refresh(
    prior: CommunityAssignment, current_modularity: float
) -> bool:
    """CONN-04: tell whether modularity drifted enough to warrant a refresh.

    Returns True only when the absolute difference between
    ``prior.modularity`` and ``current_modularity`` strictly exceeds
    REFRESH_DELTA (0.05); a drift of exactly REFRESH_DELTA does not count.
    Acting on the signal is left to the caller.
    """
    drift = abs(prior.modularity - current_modularity)
    return drift > REFRESH_DELTA