mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
158 lines
5.4 KiB
Python
158 lines
5.4 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Protocol
|
|
|
|
|
|
# Names of the generated metadata fields. These are the same six names that
# MetadataGenerator._response_format accepts; the constant itself is not
# referenced elsewhere in the visible part of this module.
GENERATED_METADATA_FIELDS = ("summary", "doc_type", "domain", "topic", "entity", "relation")
|
|
|
|
|
|
class MetadataGenerationError(RuntimeError):
    """Raised when retrieval metadata cannot be generated.

    Used in this module for an unsupported provider, missing API
    credentials, and unsupported metadata field names.
    """
|
|
|
|
|
|
@dataclass(frozen=True)
class MetadataGenerationInput:
    """Immutable description of one document to generate metadata for.

    MetadataGenerator forwards ``title``, ``source_type``, ``content_type``,
    ``metadata`` and (a truncated prefix of) ``text`` to the model.
    ``file_ref``, ``external_id`` and ``text_artifact_path`` are carried on
    the request but are not read by the generator in this module.
    """

    # Identification of the document; not sent to the model by MetadataGenerator.
    file_ref: str
    external_id: str | None

    # Descriptive attributes that are included in the model prompt.
    title: str
    content_type: str
    source_type: str | None

    # Document body; MetadataGenerator truncates it to ``max_text_chars``.
    text: str

    # Ordinary source metadata, passed through to the prompt as-is.
    metadata: dict[str, Any] = field(default_factory=dict)

    # Optional path of a stored text artifact; unused by MetadataGenerator.
    text_artifact_path: str | None = None
|
|
|
|
|
|
@dataclass(frozen=True)
class MetadataGenerationResult:
    """Outcome of one metadata generation call.

    ``values`` maps a field name to its generated value. ``failures`` maps a
    name to a string; presumably a field name to an error description, but
    nothing in this module populates it, so confirm against the callers.
    Both default to a fresh empty dict.
    """

    # Generated values keyed by field name.
    values: dict[str, Any] = field(default_factory=dict)

    # String-valued failure records; left empty by MetadataGenerator.
    failures: dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
class MetadataGenerationBackend(Protocol):
    """Structural interface for objects that can generate metadata.

    Any object exposing a compatible ``generate`` method satisfies this
    protocol; MetadataGenerator in this module is one such implementation.
    """

    def generate(
        self,
        request: MetadataGenerationInput,
        *,
        fields: list[str],
    ) -> MetadataGenerationResult | dict[str, Any]:
        """Generate the requested ``fields`` for ``request``.

        Implementations may return either a MetadataGenerationResult or a
        plain dict.
        """
        ...
|
|
|
|
|
|
class MetadataGenerator:
    """Default product generator for retrieval metadata.

    This intentionally lives under pageindex.filesystem instead of benchmark
    paths. It uses registered text today; callers can pass PageIndex-extracted
    text through the same MetadataGenerationInput without changing the API.
    Provider selection is an instance parameter rather than a provider-specific
    public class name.

    Constructor arguments that are omitted fall back to environment variables:
    ``PIFS_METADATA_PROVIDER`` (default ``"openai"``), ``PIFS_METADATA_MODEL``
    (default ``"gpt-5-nano"``) and ``PIFS_METADATA_BASE_URL`` or
    ``OPENAI_BASE_URL``. The API key is read from ``PIFS_METADATA_API_KEY`` or
    ``OPENAI_API_KEY`` when a generation call is made.
    """

    # Field names this generator can produce; each is emitted as a JSON string.
    _SUPPORTED_FIELDS = frozenset(
        {"summary", "doc_type", "domain", "topic", "entity", "relation"}
    )

    _SYSTEM_PROMPT = (
        "Generate grounded retrieval metadata for one document. "
        "Use only the provided document text and ordinary source metadata. "
        "The summary must be a retrieval summary, not a title rewrite. "
        "Do not use filenames, paths, URLs, storage URIs, or outside knowledge. "
        "Return strict JSON matching the requested fields."
    )

    def __init__(
        self,
        *,
        provider: str | None = None,
        model: str | None = None,
        base_url: str | None = None,
        max_text_chars: int = 24000,
    ):
        """Configure the generator.

        Args:
            provider: Backend name, compared case-insensitively. Only
                ``"openai"`` is supported by ``generate``.
            model: Model identifier passed to the chat completions call.
            base_url: API base URL. An explicit value (even ``""``) wins over
                the environment; an empty value means the client default.
            max_text_chars: Number of leading characters of the document text
                that are sent to the model.
        """
        # ``or`` (rather than a .get default) so an empty environment variable
        # falls back to the default, matching the base URL / API key handling.
        self.provider = (
            provider or os.environ.get("PIFS_METADATA_PROVIDER") or "openai"
        ).lower()
        self.model = model or os.environ.get("PIFS_METADATA_MODEL") or "gpt-5-nano"
        if base_url is not None:
            self.base_url = base_url
        else:
            self.base_url = os.environ.get("PIFS_METADATA_BASE_URL") or os.environ.get(
                "OPENAI_BASE_URL"
            )
        self.max_text_chars = max_text_chars

    def generate(
        self,
        request: MetadataGenerationInput,
        *,
        fields: list[str],
    ) -> MetadataGenerationResult:
        """Generate ``fields`` for ``request`` with the configured provider.

        Raises:
            MetadataGenerationError: If the provider is not ``"openai"``, or
                for any failure reported by the provider-specific path.
        """
        if self.provider != "openai":
            raise MetadataGenerationError(f"unsupported metadata provider: {self.provider}")
        return self._generate_openai(request, fields=fields)

    def _generate_openai(
        self,
        request: MetadataGenerationInput,
        *,
        fields: list[str],
    ) -> MetadataGenerationResult:
        """Call the OpenAI chat completions API and parse its JSON reply.

        Raises:
            MetadataGenerationError: If no API key is configured, a requested
                field is unsupported, the response has no choices, or the
                reply is not a JSON object.
        """
        api_key = os.environ.get("PIFS_METADATA_API_KEY") or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise MetadataGenerationError(
                "PIFS_METADATA_API_KEY or OPENAI_API_KEY is required for PIFS metadata generation"
            )

        # Imported lazily so the module can be imported without the SDK.
        from openai import OpenAI

        client = OpenAI(api_key=api_key, base_url=self.base_url or None)
        response = client.chat.completions.create(
            model=self.model,
            messages=self._build_messages(request, fields),
            response_format=self._response_format(fields),
        )
        if not response.choices:
            raise MetadataGenerationError("metadata model returned no choices")
        # A missing/empty message body is treated as "no values generated".
        content = response.choices[0].message.content or "{}"
        return MetadataGenerationResult(values=self._parse_values(content, fields))

    def _build_messages(
        self,
        request: MetadataGenerationInput,
        fields: list[str],
    ) -> list[dict[str, str]]:
        """Build the system and user chat messages for one document."""
        payload = {
            "requested_fields": fields,
            "document": {
                "title": request.title,
                "source_type": request.source_type,
                "content_type": request.content_type,
                "metadata": request.metadata,
                # Only a bounded prefix of the text is sent to the model.
                "text": request.text[: self.max_text_chars],
            },
        }
        return [
            {"role": "system", "content": self._SYSTEM_PROMPT},
            {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
        ]

    @staticmethod
    def _parse_values(content: str, fields: list[str]) -> dict[str, Any]:
        """Decode the model reply and keep only the requested fields.

        Raises:
            MetadataGenerationError: If ``content`` is not valid JSON or does
                not decode to a JSON object.
        """
        try:
            payload = json.loads(content)
        except json.JSONDecodeError as err:
            raise MetadataGenerationError(
                f"metadata model returned invalid JSON: {err}"
            ) from err
        if not isinstance(payload, dict):
            raise MetadataGenerationError(
                f"metadata model returned {type(payload).__name__} instead of a JSON object"
            )
        # Requested fields absent from the reply are dropped, not defaulted.
        return {name: payload[name] for name in fields if name in payload}

    @staticmethod
    def _response_format(fields: list[str]) -> dict[str, Any]:
        """Return the strict JSON-schema ``response_format`` for ``fields``.

        Every supported field is typed as a string. Duplicate names in
        ``fields`` are collapsed so ``required`` lists each property once.

        Raises:
            MetadataGenerationError: If a field name is not supported.
        """
        properties: dict[str, Any] = {}
        for name in fields:
            if name not in MetadataGenerator._SUPPORTED_FIELDS:
                raise MetadataGenerationError(
                    f"MetadataGenerator does not support generated metadata field: {name}"
                )
            properties[name] = {"type": "string"}
        return {
            "type": "json_schema",
            "json_schema": {
                "name": "pifs_metadata_generation",
                "strict": True,
                "schema": {
                    "type": "object",
                    "additionalProperties": False,
                    # dict keys preserve first-seen order and are unique.
                    "required": list(properties),
                    "properties": properties,
                },
            },
        }
|