From 7c916f5b982373172cde090bfd29c094c20d04a5 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Mon, 15 Jun 2026 15:07:54 +0200 Subject: [PATCH] refactor(compiler): remove dead OpenAI embedding client (RFC-012 Phase 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compiler-side EmbeddingClient (OpenAI/`text-embedding-3-small`) was pub(crate), #![allow(dead_code)], and had zero callers anywhere in the workspace; the live nearest("string") path and the offline `omnigraph embed` CLI both use the engine Gemini client. It carried the only NANOGRAPH_* env vars (vestigial 'nanograph os' naming) and was the sole user of reqwest+tokio in omnigraph-compiler — dropping them removes an HTTP client and async runtime from a crate that advertises 'Zero Lance dependency' (invariant 11). Rewrites docs/user/search/embeddings.md to the single-client reality and corrects the false 'engine embeds @embed at ingest' claim. Verified: build green, 238 compiler tests pass, `rg NANOGRAPH_` empty. --- AGENTS.md | 2 +- Cargo.lock | 2 - crates/omnigraph-compiler/Cargo.toml | 5 - crates/omnigraph-compiler/src/embedding.rs | 379 --------------------- crates/omnigraph-compiler/src/lib.rs | 1 - docs/user/search/embeddings.md | 29 +- 6 files changed, 17 insertions(+), 401 deletions(-) delete mode 100644 crates/omnigraph-compiler/src/embedding.rs diff --git a/AGENTS.md b/AGENTS.md index 7e42a2a..91e25ae 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -80,7 +80,7 @@ Full diagram and concurrency model: [docs/dev/architecture.md](docs/dev/architec | Mutations — insert/update/delete, D2, atomicity | [docs/user/mutations/index.md](docs/user/mutations/index.md) | | Search funcs (`nearest`/`bm25`/`rrf`), hybrid ranking | [docs/user/search/index.md](docs/user/search/index.md) | | Indexes (BTREE / inverted / vector / graph topology) | [docs/user/search/indexes.md](docs/user/search/indexes.md) | -| Embeddings (compiler + engine clients, env vars, `@embed`) | [docs/user/search/embeddings.md](docs/user/search/embeddings.md) | +| Embeddings (engine client, env vars, `@embed`) | [docs/user/search/embeddings.md](docs/user/search/embeddings.md) | | Concepts — what OmniGraph is, L1/L2 framing | [docs/user/concepts/index.md](docs/user/concepts/index.md) | | Quickstart — init → load → query → branch | [docs/user/quickstart.md](docs/user/quickstart.md) | | Branches, commit graph, system branches | [docs/user/branching/index.md](docs/user/branching/index.md) | diff --git a/Cargo.lock b/Cargo.lock index 33dd652..2419e9f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4915,12 +4915,10 @@ dependencies = [ "arrow-select", "pest", "pest_derive", - "reqwest 0.12.28", "serde", "serde_json", "sha2 0.10.9", "thiserror", - "tokio", ] [[package]] diff --git a/crates/omnigraph-compiler/Cargo.toml b/crates/omnigraph-compiler/Cargo.toml index bbf03f1..4645b81 100644 --- a/crates/omnigraph-compiler/Cargo.toml +++ b/crates/omnigraph-compiler/Cargo.toml @@ -20,10 +20,5 @@ pest_derive = { workspace = true } thiserror = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } -reqwest = { workspace = true } ahash = { workspace = true } -tokio = { workspace = true } sha2 = { workspace = true } - -[dev-dependencies] -tokio = { workspace = true } diff --git a/crates/omnigraph-compiler/src/embedding.rs b/crates/omnigraph-compiler/src/embedding.rs deleted file mode 100644 index 6c9e6f3..0000000 --- a/crates/omnigraph-compiler/src/embedding.rs +++ /dev/null @@ -1,379 +0,0 @@ -#![allow(dead_code)] - -use std::time::Duration; - -use reqwest::Client; -use serde::Deserialize; -use tokio::time::sleep; - -use crate::error::{NanoError, Result}; - -const DEFAULT_EMBED_MODEL: &str = "text-embedding-3-small"; -const DEFAULT_OPENAI_BASE_URL: &str = "https://api.openai.com/v1"; -const DEFAULT_TIMEOUT_MS: u64 = 30_000; -const DEFAULT_RETRY_ATTEMPTS: usize = 4; -const DEFAULT_RETRY_BACKOFF_MS: u64 = 200; - -#[derive(Clone)] -enum EmbeddingTransport { - Mock, - OpenAi { - api_key: String, - base_url: String, - http: Client, - }, -} - -#[derive(Clone)] -pub(crate) struct EmbeddingClient { - model: String, - retry_attempts: usize, - retry_backoff_ms: u64, - transport: EmbeddingTransport, -} - -struct EmbedCallError { - message: String, - retryable: bool, -} - -#[derive(Debug, Deserialize)] -struct OpenAiEmbeddingResponse { - data: Vec, -} - -#[derive(Debug, Deserialize)] -struct OpenAiEmbeddingDatum { - index: usize, - embedding: Vec, -} - -#[derive(Debug, Deserialize)] -struct OpenAiErrorEnvelope { - error: OpenAiErrorBody, -} - -#[derive(Debug, Deserialize)] -struct OpenAiErrorBody { - message: String, -} - -impl EmbeddingClient { - pub(crate) fn from_env() -> Result { - let model = std::env::var("NANOGRAPH_EMBED_MODEL") - .ok() - .map(|v| v.trim().to_string()) - .filter(|v| !v.is_empty()) - .unwrap_or_else(|| DEFAULT_EMBED_MODEL.to_string()); - let retry_attempts = - parse_env_usize("NANOGRAPH_EMBED_RETRY_ATTEMPTS", DEFAULT_RETRY_ATTEMPTS); - let retry_backoff_ms = - parse_env_u64("NANOGRAPH_EMBED_RETRY_BACKOFF_MS", DEFAULT_RETRY_BACKOFF_MS); - - if env_flag("NANOGRAPH_EMBEDDINGS_MOCK") { - return Ok(Self { - model, - retry_attempts, - retry_backoff_ms, - transport: EmbeddingTransport::Mock, - }); - } - - let api_key = std::env::var("OPENAI_API_KEY") - .ok() - .map(|v| v.trim().to_string()) - .filter(|v| !v.is_empty()) - .ok_or_else(|| { - NanoError::Execution( - "OPENAI_API_KEY is required when an embedding call is needed".to_string(), - ) - })?; - let base_url = std::env::var("OPENAI_BASE_URL") - .ok() - .map(|v| v.trim_end_matches('/').to_string()) - .filter(|v| !v.is_empty()) - .unwrap_or_else(|| DEFAULT_OPENAI_BASE_URL.to_string()); - let timeout_ms = parse_env_u64("NANOGRAPH_EMBED_TIMEOUT_MS", DEFAULT_TIMEOUT_MS); - let http = Client::builder() - .timeout(Duration::from_millis(timeout_ms)) - .build() - .map_err(|e| { - NanoError::Execution(format!("failed to initialize HTTP client: {}", e)) - })?; - - Ok(Self { - model, - retry_attempts, - retry_backoff_ms, - transport: EmbeddingTransport::OpenAi { - api_key, - base_url, - http, - }, - }) - } - - #[cfg(test)] - pub(crate) fn mock_for_tests() -> Self { - Self { - model: DEFAULT_EMBED_MODEL.to_string(), - retry_attempts: DEFAULT_RETRY_ATTEMPTS, - retry_backoff_ms: DEFAULT_RETRY_BACKOFF_MS, - transport: EmbeddingTransport::Mock, - } - } - - pub(crate) fn model(&self) -> &str { - &self.model - } - - pub(crate) async fn embed_text(&self, input: &str, expected_dim: usize) -> Result> { - let mut vectors = self.embed_texts(&[input.to_string()], expected_dim).await?; - vectors.pop().ok_or_else(|| { - NanoError::Execution("embedding provider returned no vector".to_string()) - }) - } - - pub(crate) async fn embed_texts( - &self, - inputs: &[String], - expected_dim: usize, - ) -> Result>> { - if expected_dim == 0 { - return Err(NanoError::Execution( - "embedding dimension must be greater than zero".to_string(), - )); - } - if inputs.is_empty() { - return Ok(Vec::new()); - } - - match &self.transport { - EmbeddingTransport::Mock => Ok(inputs - .iter() - .map(|input| mock_embedding(input, expected_dim)) - .collect()), - EmbeddingTransport::OpenAi { .. } => { - self.embed_texts_openai_with_retry(inputs, expected_dim) - .await - } - } - } - - async fn embed_texts_openai_with_retry( - &self, - inputs: &[String], - expected_dim: usize, - ) -> Result>> { - let max_attempt = self.retry_attempts.max(1); - let mut attempt = 0usize; - loop { - attempt += 1; - match self.embed_texts_openai_once(inputs, expected_dim).await { - Ok(vectors) => return Ok(vectors), - Err(err) => { - if !err.retryable || attempt >= max_attempt { - return Err(NanoError::Execution(err.message)); - } - let shift = (attempt - 1).min(10) as u32; - let delay = self.retry_backoff_ms.saturating_mul(1u64 << shift); - sleep(Duration::from_millis(delay)).await; - } - } - } - } - - async fn embed_texts_openai_once( - &self, - inputs: &[String], - expected_dim: usize, - ) -> std::result::Result>, EmbedCallError> { - let (api_key, base_url, http) = match &self.transport { - EmbeddingTransport::OpenAi { - api_key, - base_url, - http, - } => (api_key, base_url, http), - EmbeddingTransport::Mock => unreachable!("mock transport should not call OpenAI"), - }; - - let request = serde_json::json!({ - "model": self.model, - "input": inputs, - "dimensions": expected_dim, - }); - let url = format!("{}/embeddings", base_url); - let response = http - .post(&url) - .bearer_auth(api_key) - .json(&request) - .send() - .await; - - let response = match response { - Ok(resp) => resp, - Err(err) => { - let retryable = err.is_timeout() || err.is_connect() || err.is_request(); - return Err(EmbedCallError { - message: format!("embedding request failed: {}", err), - retryable, - }); - } - }; - - let status = response.status(); - let body = match response.text().await { - Ok(body) => body, - Err(err) => { - return Err(EmbedCallError { - message: format!( - "embedding response read failed (status {}): {}", - status, err - ), - retryable: status.is_server_error() || status.as_u16() == 429, - }); - } - }; - - if !status.is_success() { - let message = parse_openai_error_message(&body).unwrap_or_else(|| body.clone()); - return Err(EmbedCallError { - message: format!( - "embedding request failed with status {}: {}", - status, message - ), - retryable: status.is_server_error() || status.as_u16() == 429, - }); - } - - let mut parsed: OpenAiEmbeddingResponse = - serde_json::from_str(&body).map_err(|err| EmbedCallError { - message: format!("embedding response decode failed: {}", err), - retryable: false, - })?; - - if parsed.data.len() != inputs.len() { - return Err(EmbedCallError { - message: format!( - "embedding response size mismatch: expected {}, got {}", - inputs.len(), - parsed.data.len() - ), - retryable: false, - }); - } - - parsed.data.sort_by_key(|item| item.index); - let mut vectors = Vec::with_capacity(parsed.data.len()); - for (idx, item) in parsed.data.into_iter().enumerate() { - if item.index != idx { - return Err(EmbedCallError { - message: format!( - "embedding response index mismatch at position {}: got {}", - idx, item.index - ), - retryable: false, - }); - } - if item.embedding.len() != expected_dim { - return Err(EmbedCallError { - message: format!( - "embedding dimension mismatch: expected {}, got {}", - expected_dim, - item.embedding.len() - ), - retryable: false, - }); - } - vectors.push(item.embedding); - } - Ok(vectors) - } -} - -fn parse_openai_error_message(body: &str) -> Option { - serde_json::from_str::(body) - .ok() - .map(|e| e.error.message) - .filter(|msg| !msg.trim().is_empty()) -} - -fn parse_env_usize(name: &str, default: usize) -> usize { - std::env::var(name) - .ok() - .and_then(|v| v.parse::().ok()) - .filter(|v| *v > 0) - .unwrap_or(default) -} - -fn parse_env_u64(name: &str, default: u64) -> u64 { - std::env::var(name) - .ok() - .and_then(|v| v.parse::().ok()) - .filter(|v| *v > 0) - .unwrap_or(default) -} - -fn env_flag(name: &str) -> bool { - std::env::var(name) - .ok() - .map(|v| { - let s = v.trim().to_ascii_lowercase(); - s == "1" || s == "true" || s == "yes" || s == "on" - }) - .unwrap_or(false) -} - -fn mock_embedding(input: &str, dim: usize) -> Vec { - let mut seed = fnv1a64(input.as_bytes()); - let mut out = Vec::with_capacity(dim); - for _ in 0..dim { - seed = xorshift64(seed); - let ratio = (seed as f64 / u64::MAX as f64) as f32; - out.push((ratio * 2.0) - 1.0); - } - - let norm = out - .iter() - .map(|v| (*v as f64) * (*v as f64)) - .sum::() - .sqrt() as f32; - if norm > f32::EPSILON { - for value in &mut out { - *value /= norm; - } - } - out -} - -fn fnv1a64(bytes: &[u8]) -> u64 { - let mut hash = 14695981039346656037u64; - for byte in bytes { - hash ^= *byte as u64; - hash = hash.wrapping_mul(1099511628211u64); - } - hash -} - -fn xorshift64(mut x: u64) -> u64 { - x ^= x << 13; - x ^= x >> 7; - x ^= x << 17; - x -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn mock_embeddings_are_deterministic() { - let client = EmbeddingClient::mock_for_tests(); - let a = client.embed_text("alpha", 8).await.unwrap(); - let b = client.embed_text("alpha", 8).await.unwrap(); - let c = client.embed_text("beta", 8).await.unwrap(); - assert_eq!(a, b); - assert_ne!(a, c); - assert_eq!(a.len(), 8); - } -} diff --git a/crates/omnigraph-compiler/src/lib.rs b/crates/omnigraph-compiler/src/lib.rs index ba1aba2..4f85c08 100644 --- a/crates/omnigraph-compiler/src/lib.rs +++ b/crates/omnigraph-compiler/src/lib.rs @@ -1,5 +1,4 @@ pub mod catalog; -pub mod embedding; pub mod error; pub mod ir; pub mod json_output; diff --git a/docs/user/search/embeddings.md b/docs/user/search/embeddings.md index 31455c4..10723d8 100644 --- a/docs/user/search/embeddings.md +++ b/docs/user/search/embeddings.md @@ -1,28 +1,31 @@ # Embeddings -OmniGraph has **two** embedding clients with different defaults and purposes. +OmniGraph embeds text through a **single engine-side client** (Google Gemini). It is used in two places: the +query-time auto-embed of a string passed to `nearest($v, "string")`, and the offline `omnigraph embed` file +pipeline. Both paths use the same client, so query vectors and CLI-produced document vectors share one model +and one vector space. -## Compiler-side client — query-time normalization - -- Default model: `text-embedding-3-small` (OpenAI-style schema) -- Env: `NANOGRAPH_EMBED_MODEL`, `OPENAI_API_KEY`, `OPENAI_BASE_URL` (default `https://api.openai.com/v1`), `NANOGRAPH_EMBEDDINGS_MOCK`, `NANOGRAPH_EMBED_TIMEOUT_MS=30000`, `NANOGRAPH_EMBED_RETRY_ATTEMPTS=4`, `NANOGRAPH_EMBED_RETRY_BACKOFF_MS=200` -- Methods: `embed_text(input, expected_dim)`, `embed_texts(inputs, expected_dim)` -- Mock mode: deterministic FNV-1a + xorshift64 → L2-normalized vectors - -## Engine-side client — runtime ingest +## Engine embedding client - Model: `gemini-embedding-2-preview` - Env: `GEMINI_API_KEY`, `OMNIGRAPH_GEMINI_BASE_URL` (default Google generativelanguage v1beta), `OMNIGRAPH_EMBED_TIMEOUT_MS=30000`, `OMNIGRAPH_EMBED_RETRY_ATTEMPTS=4`, `OMNIGRAPH_EMBED_RETRY_BACKOFF_MS=200`, `OMNIGRAPH_EMBEDDINGS_MOCK` -- Two task types: `embed_query_text` (RETRIEVAL_QUERY) and `embed_document_text` (RETRIEVAL_DOCUMENT) +- Two task types: `embed_query_text` (RETRIEVAL_QUERY) for query strings and `embed_document_text` (RETRIEVAL_DOCUMENT) for stored documents - Exponential backoff with retryable detection (timeouts, 429, 5xx) +- Vectors are stored as L2-normalized `FixedSizeList(Float32, dim)`; the requested dimension is driven by the target column width -## Schema integration +## `@embed` schema annotation -Mark a Vector property with `@embed("source_text_property")`. At ingest, the engine pulls the source text and writes the embedding into the vector column. Stored as L2-normalized FixedSizeList(Float32, dim). +Mark a Vector property with `@embed("source_text_property")`. Today this is a **catalog annotation** consumed +by the query typechecker and linter: it records which String property is the embedding source and lets +`nearest($v, "string")` auto-embed a query string for comparison against that vector column. + +**It does not embed at ingest.** Stored vectors are supplied directly in your load data, or pre-filled by the +offline `omnigraph embed` pipeline below. (Ingest-time execution of `@embed` is a planned enhancement; until +it ships, populate the vector column yourself.) ## CLI `omnigraph embed` (offline file pipeline) -Operates on **JSONL files** (not on a graph). Three modes (mutually exclusive): +Operates on **JSONL files** (not on a graph), using the same engine client. Three modes (mutually exclusive): - (default) `fill_missing` — only embed rows whose target field is empty - `--reembed-all` — overwrite all