mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-27 20:29:41 +02:00
refactor(filesystem): make pifs providers configurable
This commit is contained in:
parent
7c021a7dd0
commit
de1992def1
7 changed files with 154 additions and 61 deletions
|
|
@ -2,11 +2,11 @@ from .commands import PIFSCommandExecutor
|
|||
from .core import PageIndexFileSystem
|
||||
from .hybrid_projection import HybridProjectionSearchBackend
|
||||
from .metadata_generation import (
|
||||
MetadataGenerationBackend,
|
||||
MetadataGenerationError,
|
||||
MetadataGenerationInput,
|
||||
MetadataGenerationResult,
|
||||
MetadataGenerator,
|
||||
OpenAIMetadataGenerator,
|
||||
)
|
||||
from .projection_indexing import SummaryProjectionIndexer
|
||||
from .semantic_index import (
|
||||
|
|
@ -20,11 +20,11 @@ from .types import OpenResult, SearchResult
|
|||
__all__ = [
|
||||
"OpenResult",
|
||||
"HybridProjectionSearchBackend",
|
||||
"MetadataGenerationBackend",
|
||||
"MetadataGenerationError",
|
||||
"MetadataGenerationInput",
|
||||
"MetadataGenerationResult",
|
||||
"MetadataGenerator",
|
||||
"OpenAIMetadataGenerator",
|
||||
"PIFSCommandExecutor",
|
||||
"PageIndexFileSystem",
|
||||
"RebuildableSemanticIndex",
|
||||
|
|
|
|||
|
|
@ -9,11 +9,11 @@ from urllib.parse import unquote, urlparse
|
|||
from ..client import PageIndexClient
|
||||
from .metadata import MetadataQueryEngine
|
||||
from .metadata_generation import (
|
||||
MetadataGenerationBackend,
|
||||
MetadataGenerationError,
|
||||
MetadataGenerationInput,
|
||||
MetadataGenerationResult,
|
||||
MetadataGenerator,
|
||||
OpenAIMetadataGenerator,
|
||||
)
|
||||
from .projection_indexing import SummaryProjectionIndexer
|
||||
from .semantic_folder_policy import (
|
||||
|
|
@ -91,7 +91,11 @@ class PageIndexFileSystem:
|
|||
workspace: Union[str, Path],
|
||||
*,
|
||||
semantic_retrieval_backend: Any | None = None,
|
||||
metadata_generator: MetadataGenerator | None = None,
|
||||
metadata_generator: MetadataGenerationBackend | None = None,
|
||||
metadata_provider: str = "openai",
|
||||
metadata_model: str | None = None,
|
||||
metadata_base_url: str | None = None,
|
||||
metadata_max_text_chars: int = 24000,
|
||||
summary_projection_indexer: SummaryProjectionIndexer | None = None,
|
||||
summary_projection_index: bool = True,
|
||||
summary_projection_index_dir: Union[str, Path, None] = None,
|
||||
|
|
@ -105,6 +109,10 @@ class PageIndexFileSystem:
|
|||
self.metadata = MetadataQueryEngine(self.store)
|
||||
self.semantic_retrieval_backend = semantic_retrieval_backend
|
||||
self.metadata_generator = metadata_generator
|
||||
self.metadata_provider = metadata_provider
|
||||
self.metadata_model = metadata_model
|
||||
self.metadata_base_url = metadata_base_url
|
||||
self.metadata_max_text_chars = metadata_max_text_chars
|
||||
self.summary_projection_indexer = summary_projection_indexer
|
||||
self.summary_projection_index = summary_projection_index
|
||||
self.summary_projection_index_dir = (
|
||||
|
|
@ -199,7 +207,12 @@ class PageIndexFileSystem:
|
|||
|
||||
def _ensure_register_completion_defaults(self) -> None:
|
||||
if self.metadata_generator is None:
|
||||
self.metadata_generator = OpenAIMetadataGenerator()
|
||||
self.metadata_generator = MetadataGenerator(
|
||||
provider=self.metadata_provider,
|
||||
model=self.metadata_model,
|
||||
base_url=self.metadata_base_url,
|
||||
max_text_chars=self.metadata_max_text_chars,
|
||||
)
|
||||
if self.summary_projection_index and self.summary_projection_indexer is None:
|
||||
self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
|
||||
self.summary_projection_index_dir,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
|
@ -331,17 +330,22 @@ class EmbeddingCache:
|
|||
return [cached[text_hash] for text_hash in hashes]
|
||||
|
||||
|
||||
class OpenAIEmbeddingClient:
|
||||
def __init__(self, model: str, *, dimensions: int, timeout: float):
|
||||
from openai import OpenAI
|
||||
|
||||
class EmbeddingClient:
|
||||
def __init__(self, *, provider: str, model: str, dimensions: int, timeout: float):
|
||||
self.provider = provider.lower()
|
||||
self.model = model
|
||||
self.dimensions = dimensions
|
||||
self.client = OpenAI(
|
||||
api_key=os.environ.get("OPENAI_API_KEY"),
|
||||
base_url=os.environ.get("OPENAI_BASE_URL") or None,
|
||||
timeout=timeout,
|
||||
)
|
||||
if self.provider != "openai":
|
||||
raise ValueError(f"unknown embedding provider: {provider}")
|
||||
from openai import OpenAI
|
||||
|
||||
api_key = os.environ.get("PIFS_EMBEDDING_API_KEY") or os.environ.get("OPENAI_API_KEY")
|
||||
base_url = os.environ.get("PIFS_EMBEDDING_BASE_URL") or os.environ.get("OPENAI_BASE_URL")
|
||||
if not api_key:
|
||||
raise ValueError(
|
||||
"PIFS_EMBEDDING_API_KEY or OPENAI_API_KEY is required for PIFS embeddings"
|
||||
)
|
||||
self.client = OpenAI(api_key=api_key, base_url=base_url or None, timeout=timeout)
|
||||
|
||||
def embed(self, texts: list[str]) -> list[list[float]]:
|
||||
kwargs: dict[str, Any] = {"model": self.model, "input": texts}
|
||||
|
|
@ -351,32 +355,13 @@ class OpenAIEmbeddingClient:
|
|||
return [list(item.embedding) for item in sorted(response.data, key=lambda item: item.index)]
|
||||
|
||||
|
||||
class HashEmbeddingClient:
    """Embedder that hashes keyword terms into a fixed-size signed vector.

    Each term produced by ``keyword_terms`` is hashed with BLAKE2b. The first
    four digest bytes select a bucket and the fifth selects the sign (+1/-1)
    added to that bucket. A non-zero vector is scaled to unit Euclidean
    length; an all-zero vector is returned unchanged.
    """

    _DEFAULT_DIMENSIONS = 256
    # Only the first 256 terms of a text contribute to its vector.
    _MAX_TERMS = 256

    def __init__(self, dimensions: int = 256):
        """Store the vector size.

        A non-positive ``dimensions`` cannot be used: ``_embed_one`` would
        take a modulo by zero or index into an empty vector. Such values fall
        back to the default size, the same rule ``make_embedder`` applies
        before constructing this class.
        """
        self.dimensions = dimensions if dimensions > 0 else self._DEFAULT_DIMENSIONS

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one vector per input text, in input order."""
        return [self._embed_one(text) for text in texts]

    def _embed_one(self, text: str) -> list[float]:
        """Hash the terms of ``text`` into a ``self.dimensions``-long vector."""
        vector = [0.0] * self.dimensions
        for term in keyword_terms(text)[: self._MAX_TERMS]:
            digest = hashlib.blake2b(term.encode("utf-8"), digest_size=8).digest()
            # Bytes 0-3 pick the bucket; byte 4 picks the sign.
            bucket = int.from_bytes(digest[:4], "little") % self.dimensions
            sign = 1.0 if digest[4] % 2 == 0 else -1.0
            vector[bucket] += sign
        norm = sum(value * value for value in vector) ** 0.5
        if norm:
            # Skip normalisation for an all-zero vector to avoid dividing by zero.
            vector = [value / norm for value in vector]
        return vector
|
||||
|
||||
|
||||
def make_embedder(provider: str, model: str, *, dimensions: int, timeout: float) -> Any:
|
||||
if provider == "openai":
|
||||
return OpenAIEmbeddingClient(model, dimensions=dimensions, timeout=timeout)
|
||||
if provider == "hash":
|
||||
return HashEmbeddingClient(dimensions=dimensions if dimensions > 0 else 256)
|
||||
raise ValueError(f"unknown embedding provider: {provider}")
|
||||
return EmbeddingClient(
|
||||
provider=provider,
|
||||
model=model,
|
||||
dimensions=dimensions,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
|
||||
def query_text_for_channel(channel: str, query: str, projection: QueryProjection) -> str:
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ class MetadataGenerationResult:
|
|||
failures: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
class MetadataGenerator(Protocol):
|
||||
class MetadataGenerationBackend(Protocol):
|
||||
def generate(
|
||||
self,
|
||||
request: MetadataGenerationInput,
|
||||
|
|
@ -42,23 +42,31 @@ class MetadataGenerator(Protocol):
|
|||
...
|
||||
|
||||
|
||||
class OpenAIMetadataGenerator:
|
||||
class MetadataGenerator:
|
||||
"""Default product generator for retrieval metadata.
|
||||
|
||||
This intentionally lives under pageindex.filesystem instead of benchmark
|
||||
paths. It uses registered text today; callers can pass PageIndex-extracted
|
||||
text through the same MetadataGenerationInput without changing the API.
|
||||
Provider selection is an instance parameter rather than a provider-specific
|
||||
public class name.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
provider: str | None = None,
|
||||
model: str | None = None,
|
||||
base_url: str | None = None,
|
||||
max_text_chars: int = 24000,
|
||||
):
|
||||
self.provider = (provider or os.environ.get("PIFS_METADATA_PROVIDER", "openai")).lower()
|
||||
self.model = model or os.environ.get("PIFS_METADATA_MODEL", "gpt-5-nano")
|
||||
self.base_url = base_url if base_url is not None else os.environ.get("OPENAI_BASE_URL")
|
||||
self.base_url = (
|
||||
base_url
|
||||
if base_url is not None
|
||||
else os.environ.get("PIFS_METADATA_BASE_URL") or os.environ.get("OPENAI_BASE_URL")
|
||||
)
|
||||
self.max_text_chars = max_text_chars
|
||||
|
||||
def generate(
|
||||
|
|
@ -67,9 +75,21 @@ class OpenAIMetadataGenerator:
|
|||
*,
|
||||
fields: list[str],
|
||||
) -> MetadataGenerationResult:
|
||||
api_key = os.environ.get("OPENAI_API_KEY")
|
||||
if self.provider != "openai":
|
||||
raise MetadataGenerationError(f"unsupported metadata provider: {self.provider}")
|
||||
return self._generate_openai(request, fields=fields)
|
||||
|
||||
def _generate_openai(
|
||||
self,
|
||||
request: MetadataGenerationInput,
|
||||
*,
|
||||
fields: list[str],
|
||||
) -> MetadataGenerationResult:
|
||||
api_key = os.environ.get("PIFS_METADATA_API_KEY") or os.environ.get("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise MetadataGenerationError("OPENAI_API_KEY is required for PIFS metadata generation")
|
||||
raise MetadataGenerationError(
|
||||
"PIFS_METADATA_API_KEY or OPENAI_API_KEY is required for PIFS metadata generation"
|
||||
)
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
|
|
@ -122,7 +142,7 @@ class OpenAIMetadataGenerator:
|
|||
properties[field] = {"type": "string"}
|
||||
else:
|
||||
raise MetadataGenerationError(
|
||||
f"OpenAIMetadataGenerator does not support generated metadata field: {field}"
|
||||
f"MetadataGenerator does not support generated metadata field: {field}"
|
||||
)
|
||||
return {
|
||||
"type": "json_schema",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue