PageIndex/pageindex/filesystem/metadata_generation.py
2026-06-01 01:40:44 +08:00

158 lines
5.4 KiB
Python

from __future__ import annotations
import json
import os
from dataclasses import dataclass, field
from typing import Any, Protocol
# Names of the metadata fields this module knows how to generate. These are
# exactly the field names MetadataGenerator._response_format accepts; any other
# name is rejected there with MetadataGenerationError.
GENERATED_METADATA_FIELDS = ("summary", "doc_type", "domain", "topic", "entity", "relation")
class MetadataGenerationError(RuntimeError):
    """Error raised when retrieval metadata cannot be generated.

    Raised by this module for an unsupported provider, a missing API key, or
    an unsupported metadata field.
    """
@dataclass(frozen=True)
class MetadataGenerationInput:
    """Immutable description of one document whose metadata is requested.

    MetadataGenerator forwards ``title``, ``source_type``, ``content_type``,
    ``metadata`` and a truncated prefix of ``text`` to the provider. The
    remaining fields (``file_ref``, ``external_id``, ``text_artifact_path``)
    are carried on the request but are not sent by MetadataGenerator.
    """

    # Required fields; declaration order is the positional-argument order.
    file_ref: str
    external_id: str | None
    title: str
    content_type: str
    source_type: str | None
    text: str

    # Optional fields. ``metadata`` uses a factory so every instance gets its
    # own dict rather than sharing a single default.
    metadata: dict[str, Any] = field(default_factory=dict)
    text_artifact_path: str | None = None
@dataclass(frozen=True)
class MetadataGenerationResult:
    """Immutable outcome of one metadata generation call."""

    # Generated values keyed by metadata field name.
    values: dict[str, Any] = field(default_factory=dict)

    # String messages for things that could not be generated; presumably keyed
    # by metadata field name as well -- confirm against callers.
    failures: dict[str, str] = field(default_factory=dict)
class MetadataGenerationBackend(Protocol):
    """Structural interface for anything that can generate metadata.

    An object satisfies this protocol by exposing a ``generate`` method with
    the signature below; no inheritance is required.
    """

    def generate(
        self,
        request: MetadataGenerationInput,
        *,
        fields: list[str],
    ) -> MetadataGenerationResult | dict[str, Any]:
        """Generate the requested ``fields`` for ``request``.

        Implementations may return either a MetadataGenerationResult or a
        plain dict.
        """
        ...
class MetadataGenerator:
    """Default product generator for retrieval metadata.

    This intentionally lives under pageindex.filesystem instead of benchmark
    paths. It uses registered text today; callers can pass PageIndex-extracted
    text through the same MetadataGenerationInput without changing the API.
    Provider selection is an instance parameter rather than a provider-specific
    public class name.
    """

    # JSON-schema fragment emitted for each supported generated field. Every
    # supported field is currently described as a plain string.
    _FIELD_SCHEMAS: dict[str, dict[str, Any]] = {
        "summary": {"type": "string"},
        "doc_type": {"type": "string"},
        "domain": {"type": "string"},
        "topic": {"type": "string"},
        "entity": {"type": "string"},
        "relation": {"type": "string"},
    }

    def __init__(
        self,
        *,
        provider: str | None = None,
        model: str | None = None,
        base_url: str | None = None,
        max_text_chars: int = 24000,
    ):
        """Configure the generator from arguments, falling back to environment.

        Args:
            provider: Provider name (lower-cased). Falls back to
                ``PIFS_METADATA_PROVIDER`` and then ``"openai"``.
            model: Model name. Falls back to ``PIFS_METADATA_MODEL`` and then
                ``"gpt-5-nano"``.
            base_url: API base URL. When ``None``, falls back to
                ``PIFS_METADATA_BASE_URL`` and then ``OPENAI_BASE_URL``. An
                explicit empty string is kept and disables the fallback.
            max_text_chars: Number of leading characters of the document text
                that are sent to the provider.
        """
        # Use ``or`` chains so an environment variable that is set but empty
        # falls through to the default, matching how base_url is resolved.
        self.provider = (
            provider or os.environ.get("PIFS_METADATA_PROVIDER") or "openai"
        ).lower()
        self.model = model or os.environ.get("PIFS_METADATA_MODEL") or "gpt-5-nano"
        self.base_url = (
            base_url
            if base_url is not None
            else os.environ.get("PIFS_METADATA_BASE_URL") or os.environ.get("OPENAI_BASE_URL")
        )
        self.max_text_chars = max_text_chars

    def generate(
        self,
        request: MetadataGenerationInput,
        *,
        fields: list[str],
    ) -> MetadataGenerationResult:
        """Generate the requested metadata ``fields`` for ``request``.

        Raises:
            MetadataGenerationError: If the configured provider is not
                ``"openai"``, or if generation itself fails.
        """
        if self.provider != "openai":
            raise MetadataGenerationError(f"unsupported metadata provider: {self.provider}")
        return self._generate_openai(request, fields=fields)

    def _generate_openai(
        self,
        request: MetadataGenerationInput,
        *,
        fields: list[str],
    ) -> MetadataGenerationResult:
        """Ask an OpenAI-compatible chat completion endpoint for the metadata.

        The API key is read from ``PIFS_METADATA_API_KEY`` or
        ``OPENAI_API_KEY``. Only the title, source type, content type, source
        metadata and the first ``max_text_chars`` characters of the text are
        sent.

        Returns:
            A result whose ``values`` holds every requested field present in
            the response and whose ``failures`` names every requested field
            that was absent.

        Raises:
            MetadataGenerationError: If no API key is configured, a requested
                field is unsupported, the response has no choices, or the
                response body is not a JSON object.
        """
        api_key = os.environ.get("PIFS_METADATA_API_KEY") or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise MetadataGenerationError(
                "PIFS_METADATA_API_KEY or OPENAI_API_KEY is required for PIFS metadata generation"
            )

        # Function-scope import: openai is only needed once generation runs.
        from openai import OpenAI

        # Drop duplicates while keeping the caller's order, so the prompt, the
        # schema and the result all agree on one list of names.
        requested = list(dict.fromkeys(fields))

        client = OpenAI(api_key=api_key, base_url=self.base_url or None)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Generate grounded retrieval metadata for one document. "
                        "Use only the provided document text and ordinary source metadata. "
                        "The summary must be a retrieval summary, not a title rewrite. "
                        "Do not use filenames, paths, URLs, storage URIs, or outside knowledge. "
                        "Return strict JSON matching the requested fields."
                    ),
                },
                {
                    "role": "user",
                    "content": json.dumps(
                        {
                            "requested_fields": requested,
                            "document": {
                                "title": request.title,
                                "source_type": request.source_type,
                                "content_type": request.content_type,
                                "metadata": request.metadata,
                                "text": request.text[: self.max_text_chars],
                            },
                        },
                        ensure_ascii=False,
                    ),
                },
            ],
            response_format=self._response_format(requested),
        )

        if not response.choices:
            raise MetadataGenerationError("metadata provider returned no choices")
        return self._parse_response_content(response.choices[0].message.content, requested)

    @staticmethod
    def _parse_response_content(
        content: str | None,
        fields: list[str],
    ) -> MetadataGenerationResult:
        """Turn the provider's JSON text into a MetadataGenerationResult."""
        try:
            # Empty or missing content is treated as an empty JSON object.
            payload = json.loads(content or "{}")
        except json.JSONDecodeError as err:
            raise MetadataGenerationError(
                f"metadata provider returned invalid JSON: {err}"
            ) from err
        if not isinstance(payload, dict):
            raise MetadataGenerationError(
                "metadata provider returned "
                f"{type(payload).__name__}, expected a JSON object"
            )

        values = {name: payload[name] for name in fields if name in payload}
        # Report absent fields instead of dropping them silently.
        failures = {
            name: "field missing from provider response"
            for name in fields
            if name not in payload
        }
        return MetadataGenerationResult(values=values, failures=failures)

    @staticmethod
    def _response_format(fields: list[str]) -> dict[str, Any]:
        """Build the strict ``json_schema`` response format for ``fields``.

        Duplicate names are collapsed (first occurrence wins) and the returned
        structure never aliases the caller's ``fields`` sequence.

        Raises:
            MetadataGenerationError: If a field name is not supported.
        """
        unique_fields = list(dict.fromkeys(fields))
        properties: dict[str, Any] = {}
        for name in unique_fields:
            schema = MetadataGenerator._FIELD_SCHEMAS.get(name)
            if schema is None:
                raise MetadataGenerationError(
                    f"MetadataGenerator does not support generated metadata field: {name}"
                )
            # Copy so callers cannot mutate the shared class-level table.
            properties[name] = dict(schema)
        return {
            "type": "json_schema",
            "json_schema": {
                "name": "pifs_metadata_generation",
                "strict": True,
                "schema": {
                    "type": "object",
                    "additionalProperties": False,
                    "required": unique_fields,
                    "properties": properties,
                },
            },
        }