mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-07-03 02:51:04 +02:00
mr-668: POST /graphs runtime create endpoint (PR 7/10)
PR 7 of the MR-668 multi-graph server work. Operators can now add a graph to a running multi-graph server without restarting: curl -X POST http://server/graphs \ -H "Content-Type: application/json" \ -d '{ "graph_id": "beta", "uri": "/data/beta.omni", "schema": { "source": "node Person { name: String @key }\n" }, "policy": { "file": "./policies/beta.yaml" } }' DELETE remains deferred (out of v0.7.0 scope per the trimmed plan — no `delete_prefix`, no tombstones). Body shape (decision 7): - Nested `schema: { source: "..." }` (mirrors the `policy: { file }` pattern; leaves room for future fields without breakage). - Optional nested `policy: { file: "..." }` for per-graph Cedar. - 32 MiB body limit (reuses `INGEST_REQUEST_BODY_LIMIT_BYTES`). - Asymmetric with `SchemaApplyRequest` which keeps flat `schema_source: String` — documented in api.rs. Atomic YAML rewrite + drift detection: - New `config::rewrite_atomic(path, new_config, expected_hash)`: flock → re-read + hash check → serialize → write `.tmp` → fsync → rename → fsync parent dir. Returns the new hash for the caller to update its in-memory baseline. - New `config::hash_config_file(path)` — SHA-256 of the on-disk bytes, used at startup and after each rewrite. - New `RewriteAtomicError { Drift | Io | Serialize }` enum. - `AppState.config_hash: Option<Arc<Mutex<[u8;32]>>>` carries the in-memory baseline. Updated after every successful rewrite so subsequent POSTs don't false-trigger drift. - The mutex is `std::sync::Mutex` (brief critical section, no .await inside). The flock itself serializes file access process-wide AND across multiple server instances (defense in depth). - All sync I/O runs inside `tokio::task::spawn_blocking` — flock is sync. Handler ordering (the load-bearing sequence): 1. Mode check: 405 in single mode. 2. Cedar authorize: `GraphCreate` against `Omnigraph::Server::"root"`. 3. Validate body: `GraphId::try_from` (regex + reserved-name), empty schema/uri checks, per-graph policy file parse. 4. 
Pre-check registry for duplicate graph_id / duplicate uri (409). 5. `Omnigraph::init` the new engine. 6. Atomic YAML rewrite (drift detection inside). 7. Publish in registry (atomic re-check via `GraphRegistry::insert`). Failure modes (documented in handler rustdoc): - Init fails → orphan storage at `req.uri` (PR 2a cleans up schema files; Lance datasets remain orphans until `delete_prefix` lands). - YAML rewrite fails (drift, IO) → orphan storage; YAML unchanged. - Registry insert fails (race) → YAML has entry but registry doesn't; next restart opens it cleanly. New dependency: `fs2 = "0.4"` (workspace + omnigraph-server). POSIX-only file locking. Linux/macOS deployment supported; Windows out of scope. Tests (10 new in `tests/server.rs::multi_graph_startup`): - `post_graphs_creates_a_new_graph_end_to_end` — happy path, includes YAML inspection to confirm the rewrite landed. - `post_graphs_baseline_hash_updates_between_rewrites` — two POSTs in a row both succeed (drift baseline updates correctly). - `post_graphs_duplicate_graph_id_returns_409` - `post_graphs_duplicate_uri_returns_409` - `post_graphs_invalid_graph_id_returns_400` (reserved name) - `post_graphs_empty_schema_source_returns_400` - `post_graphs_returns_405_in_single_mode` - `post_graphs_yaml_drift_detection_returns_503` — operator hand-edits omnigraph.yaml; server refuses to clobber. - `hash_config_file_is_deterministic_and_detects_changes` - `rewrite_atomic_refuses_when_hash_drifts` OpenAPI: `server_graphs_create` registered in `ApiDoc::paths(...)`; openapi.json regenerated. Result: 225 server tests green (74 lib + 66 openapi + 85 integration), all MR-731 regressions still pinned. LOC: ~580 lib.rs net (handler + helpers), ~120 config.rs (rewrite machinery), +71 api.rs (request/response shapes), +332 tests/server.rs. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
94b6346bdd
commit
a4e6cb689a
9 changed files with 1030 additions and 5 deletions
|
|
@ -5,7 +5,9 @@ use std::path::{Path, PathBuf};
|
|||
|
||||
use clap::ValueEnum;
|
||||
use color_eyre::eyre::{Result, bail};
|
||||
use fs2::FileExt;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
pub const DEFAULT_CONFIG_FILE: &str = "omnigraph.yaml";
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
|
|
@ -371,6 +373,126 @@ fn absolute_base_dir(cwd: &Path, path: &Path) -> Result<PathBuf> {
|
|||
.unwrap_or_else(|| cwd.to_path_buf()))
|
||||
}
|
||||
|
||||
/// SHA-256 hash of the file at `path`. Used to baseline `omnigraph.yaml`
|
||||
/// at server startup; later compared inside `rewrite_atomic` to detect
|
||||
/// operator hand-edits ("YAML drift") that would otherwise be clobbered
|
||||
/// silently. Read errors propagate so startup fails loudly if the
|
||||
/// config file disappears between `load_config` and the hashing.
|
||||
pub fn hash_config_file(path: &Path) -> std::io::Result<[u8; 32]> {
|
||||
let bytes = fs::read(path)?;
|
||||
let digest = Sha256::digest(&bytes);
|
||||
let mut out = [0u8; 32];
|
||||
out.copy_from_slice(&digest);
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// Why `rewrite_atomic` refused to rewrite.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RewriteAtomicError {
|
||||
/// The on-disk file no longer matches the expected hash — an
|
||||
/// operator hand-edited `omnigraph.yaml` between server start
|
||||
/// and now. Rewriting would clobber their changes; instead we
|
||||
/// refuse loudly. Maps to HTTP 503.
|
||||
#[error(
|
||||
"omnigraph.yaml drift detected: on-disk file does not match the server's startup baseline. \
|
||||
Stop the server, reconcile the edits, then restart."
|
||||
)]
|
||||
Drift,
|
||||
/// IO failure during the rewrite — couldn't acquire flock, couldn't
|
||||
/// write the staging file, couldn't rename, etc. The on-disk file
|
||||
/// is unchanged (rename is atomic on POSIX). Maps to HTTP 500.
|
||||
#[error("{0}")]
|
||||
Io(#[from] std::io::Error),
|
||||
/// Failed to serialize the new `OmnigraphConfig` to YAML. Should
|
||||
/// not happen in practice — `OmnigraphConfig` has no infallible
|
||||
/// serde paths in the current types. Maps to HTTP 500.
|
||||
#[error("serialize config: {0}")]
|
||||
Serialize(#[from] serde_yaml::Error),
|
||||
}
|
||||
|
||||
/// Atomically rewrite `omnigraph.yaml` under an exclusive `fcntl::flock`
|
||||
/// with SHA-256 drift detection (MR-668 PR 7).
|
||||
///
|
||||
/// Returns the new file's hash on success — callers update their
|
||||
/// in-memory baseline to this value before releasing other request
|
||||
/// handlers.
|
||||
///
|
||||
/// Sequence (everything inside the flock):
|
||||
/// 1. Acquire `LOCK_EX` on `path`.
|
||||
/// 2. Re-read on-disk bytes, hash them.
|
||||
/// 3. If on-disk hash != `expected_hash` → `RewriteAtomicError::Drift`.
|
||||
/// 4. Serialize `new_config` to YAML.
|
||||
/// 5. Write to `path.tmp` and `sync_all` it.
|
||||
/// 6. `rename(path.tmp, path)` (atomic on POSIX).
|
||||
/// 7. `sync_all` the parent directory for crash-durability.
|
||||
/// 8. Release flock (RAII drop on the File).
|
||||
///
|
||||
/// Sync I/O throughout — callers wrap in `tokio::task::spawn_blocking`
|
||||
/// so the async runtime doesn't stall.
|
||||
///
|
||||
/// **Comments are stripped.** `serde_yaml::to_string` produces canonical
|
||||
/// YAML without preserving the operator's comments. Decision Q20 in the
|
||||
/// MR-668 plan accepts this tradeoff for v0.7.0; a future split-file
|
||||
/// design (`omnigraph.yaml` operator-owned + `omnigraph.runtime.yaml`
|
||||
/// server-owned) is the escalation path if operators push back.
|
||||
pub fn rewrite_atomic(
|
||||
path: &Path,
|
||||
new_config: &OmnigraphConfig,
|
||||
expected_hash: &[u8; 32],
|
||||
) -> std::result::Result<[u8; 32], RewriteAtomicError> {
|
||||
// 1. flock. Open RW so flock works; we re-read via fs::read below.
|
||||
let lock_file = fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(path)?;
|
||||
lock_file.lock_exclusive()?;
|
||||
// RAII unlock via `_lock_guard` — the file dropping releases the flock.
|
||||
let _lock_guard = lock_file;
|
||||
|
||||
// 2. Re-read + hash.
|
||||
let current_bytes = fs::read(path)?;
|
||||
let mut current_hash = [0u8; 32];
|
||||
current_hash.copy_from_slice(&Sha256::digest(¤t_bytes));
|
||||
|
||||
// 3. Drift check.
|
||||
if current_hash != *expected_hash {
|
||||
return Err(RewriteAtomicError::Drift);
|
||||
}
|
||||
|
||||
// 4. Serialize new config.
|
||||
let serialized = serde_yaml::to_string(new_config)?;
|
||||
|
||||
// 5. Write to .tmp + fsync.
|
||||
let tmp_path = staging_path(path);
|
||||
fs::write(&tmp_path, &serialized)?;
|
||||
let tmp_file = fs::File::open(&tmp_path)?;
|
||||
tmp_file.sync_all()?;
|
||||
drop(tmp_file);
|
||||
|
||||
// 6. Atomic rename.
|
||||
fs::rename(&tmp_path, path)?;
|
||||
|
||||
// 7. fsync parent dir for crash-durability (POSIX rename isn't
|
||||
// durable until the directory entry is synced).
|
||||
if let Some(parent) = path.parent() {
|
||||
let dir = fs::File::open(parent)?;
|
||||
dir.sync_all()?;
|
||||
}
|
||||
|
||||
// Compute the new file's hash for the caller to update its baseline.
|
||||
let mut new_hash = [0u8; 32];
|
||||
new_hash.copy_from_slice(&Sha256::digest(serialized.as_bytes()));
|
||||
Ok(new_hash)
|
||||
}
|
||||
|
||||
/// Staging path used during `rewrite_atomic`: the full input path with
/// `.tmp` appended (`omnigraph.yaml` → `omnigraph.yaml.tmp`), chosen to
/// avoid colliding with any other workflow that might be reading the
/// file. The suffix is appended to the whole path rather than replacing
/// the extension.
fn staging_path(path: &Path) -> PathBuf {
    let mut staged = path.as_os_str().to_os_string();
    staged.push(".tmp");
    staged.into()
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::fs;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue