feat: compatible with PageIndex SDK (#238)

* feat: compatible with PageIndex SDK

* corner cases fixed

* fix: mock behavior of old SDK

* fix: close streaming response and warn on empty api_key

- LegacyCloudAPI: close response in `finally` for both _stream_chat_response
  variants so abandoned iterators no longer leak the TCP connection.
- PageIndexClient: emit a warning instead of silently falling back to local
  when api_key is the empty string, surfacing typical env-var-unset misconfig.
- FakeResponse: add close()/closed to match the real requests.Response API.
- Add unit coverage for stream close (both paths) and the empty-api_key warning.
- Add scripts/e2e_legacy_sdk.py to smoke-test the legacy SDK contract end-to-end
  against api.pageindex.ai.

* chore: mark legacy SDK methods with @deprecated and docstring pointers

- Decorate the 12 PageIndexClient cloud-SDK compat methods with
  @typing_extensions.deprecated(..., category=PendingDeprecationWarning):
  - IDE/type-checkers render them with a strikethrough hint
  - runtime warnings stay silent by default (no spam for existing callers),
    surfaceable via `python -W default::PendingDeprecationWarning`
- Add a one-line docstring on each pointing to the Collection-based equivalent.
- Promote typing-extensions to a direct dependency (was transitive via litellm).

---------

Co-authored-by: XinyanZhou <xinyanzhou@XinyanZhoudeMacBook-Pro.local>
Co-authored-by: saccharin98 <xinyanzhou938@gmail.com>
Co-authored-by: mountain <kose2livs@gmail.com>
This commit is contained in:
Xinyan Zhou 2026-05-11 21:06:23 +08:00 committed by GitHub
parent 6d29886892
commit 595895cf28
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 1030 additions and 20 deletions

View file

@ -1,5 +1,6 @@
from pageindex.errors import (
PageIndexError,
PageIndexAPIError,
CollectionNotFoundError,
DocumentNotFoundError,
IndexingError,
@ -9,9 +10,10 @@ from pageindex.errors import (
def test_all_errors_inherit_from_base():
for cls in [CollectionNotFoundError, DocumentNotFoundError, IndexingError, CloudAPIError, FileTypeError]:
for cls in [PageIndexAPIError, CollectionNotFoundError, DocumentNotFoundError, IndexingError, CloudAPIError, FileTypeError]:
assert issubclass(cls, PageIndexError)
assert issubclass(cls, Exception)
assert issubclass(CloudAPIError, PageIndexAPIError)
def test_error_message():
@ -20,7 +22,7 @@ def test_error_message():
def test_catch_base_catches_all():
for cls in [CollectionNotFoundError, DocumentNotFoundError, IndexingError, CloudAPIError, FileTypeError]:
for cls in [PageIndexAPIError, CollectionNotFoundError, DocumentNotFoundError, IndexingError, CloudAPIError, FileTypeError]:
try:
raise cls("test")
except PageIndexError:

View file

@ -0,0 +1,325 @@
import pytest
import requests
from pageindex.client import PageIndexAPIError as ClientPageIndexAPIError
from pageindex import PageIndexAPIError, PageIndexClient
from pageindex.client import CloudClient
class FakeResponse:
    """Minimal stand-in for the response object returned by ``requests.request``.

    Only the members the tests in this module touch are provided:
    ``status_code``, ``text``, ``json()``, ``iter_lines()``, ``close()`` and a
    ``closed`` flag that records whether ``close()`` was called.

    Args:
        status_code: HTTP status code to report.
        payload: Value returned by ``json()``; defaults to an empty dict.
        text: Response body text.
        lines: Sequence yielded by ``iter_lines()``; defaults to no lines.
    """

    def __init__(self, status_code=200, payload=None, text="ok", lines=None):
        self.status_code = status_code
        # Test against None rather than truthiness so that an explicitly
        # supplied falsy payload (e.g. an empty list) is returned unchanged
        # instead of being replaced by ``{}``.
        self._payload = {} if payload is None else payload
        self.text = text
        self._lines = [] if lines is None else lines
        self.closed = False

    def json(self):
        """Return the payload supplied at construction time."""
        return self._payload

    def iter_lines(self):
        """Return a fresh iterator over the configured lines."""
        return iter(self._lines)

    def close(self):
        """Record that the response has been closed."""
        self.closed = True
class StreamingErrorResponse(FakeResponse):
    """Fake response whose line iteration fails with a read timeout."""

    def iter_lines(self):
        """Raise ``requests.ReadTimeout`` instead of yielding any line."""
        raise requests.ReadTimeout("stream stalled")
def test_legacy_imports_and_initializers():
    """The api_key reaches the legacy cloud API for every constructor form.

    Also checks that ``PageIndexAPIError`` is an ``Exception`` subclass and
    that the name imported from ``pageindex.client`` is the same object as
    the one exported by the package root.
    """
    by_position = PageIndexClient("pi-test")
    by_keyword = PageIndexClient(api_key="pi-test")
    cloud_client = CloudClient(api_key="pi-test")

    for legacy_client in (by_position, by_keyword, cloud_client):
        assert legacy_client._legacy_cloud_api.api_key == "pi-test"

    assert issubclass(PageIndexAPIError, Exception)
    assert ClientPageIndexAPIError is PageIndexAPIError
def test_legacy_methods_exist():
    """Every legacy SDK method name resolves to a callable on the client."""
    sdk_client = PageIndexClient("pi-test")
    legacy_method_names = (
        "submit_document",
        "get_ocr",
        "get_tree",
        "is_retrieval_ready",
        "submit_query",
        "get_retrieval",
        "chat_completions",
        "get_document",
        "delete_document",
        "list_documents",
        "create_folder",
        "list_folders",
    )
    for name in legacy_method_names:
        assert callable(getattr(sdk_client, name))
def test_legacy_base_url_can_be_overridden_from_client(monkeypatch):
    """Overriding ``PageIndexClient.BASE_URL`` changes the URL that is requested."""
    recorded = []

    def fake_request(method, url, headers=None, **kwargs):
        recorded.append({"method": method, "url": url, "headers": headers})
        return FakeResponse(payload={"id": "doc-1"})

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)
    monkeypatch.setattr(PageIndexClient, "BASE_URL", "https://staging.pageindex.test")

    document = PageIndexClient("pi-test").get_document("doc-1")

    assert document == {"id": "doc-1"}
    first_call = recorded[0]
    assert first_call["method"] == "GET"
    assert first_call["url"] == "https://staging.pageindex.test/doc/doc-1/metadata/"
    assert first_call["headers"] == {"api_key": "pi-test"}
def test_submit_document_uses_legacy_endpoint(monkeypatch, tmp_path):
    """``submit_document`` POSTs to ``/doc/`` with the legacy form fields.

    The fake request records every argument, so the test can also check that
    no ``timeout`` keyword is passed through.
    """
    recorded = []

    def fake_request(method, url, headers=None, files=None, data=None, **kwargs):
        recorded.append(
            dict(
                method=method,
                url=url,
                headers=headers,
                data=data,
                files=files,
                kwargs=kwargs,
            )
        )
        return FakeResponse(payload={"doc_id": "doc-1"})

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    pdf_path = tmp_path / "doc.pdf"
    pdf_path.write_bytes(b"%PDF-1.4")

    sdk_client = PageIndexClient("pi-test")
    response = sdk_client.submit_document(
        str(pdf_path),
        mode="mcp",
        beta_headers=["block_reference"],
        folder_id="folder-1",
    )

    assert response == {"doc_id": "doc-1"}

    request = recorded[0]
    assert request["method"] == "POST"
    assert request["url"] == "https://api.pageindex.ai/doc/"
    assert request["headers"] == {"api_key": "pi-test"}
    assert "timeout" not in request["kwargs"]

    form = request["data"]
    assert form["if_retrieval"] is True
    assert form["mode"] == "mcp"
    assert form["beta_headers"] == '["block_reference"]'
    assert form["folder_id"] == "folder-1"
def test_get_ocr_and_tree_use_legacy_urls(monkeypatch):
    """``get_ocr`` and ``get_tree`` request the legacy query-string URLs."""
    requests_seen = []

    def fake_request(method, url, headers=None, **kwargs):
        requests_seen.append({"method": method, "url": url, "headers": headers})
        return FakeResponse(payload={"status": "completed", "retrieval_ready": True})

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)
    sdk_client = PageIndexClient("pi-test")

    ocr_result = sdk_client.get_ocr("doc-1", format="page")
    assert ocr_result["status"] == "completed"
    tree_result = sdk_client.get_tree("doc-1", node_summary=True)
    assert tree_result["retrieval_ready"] is True

    ocr_call, tree_call = requests_seen[0], requests_seen[1]
    assert ocr_call["method"] == "GET"
    assert ocr_call["url"] == "https://api.pageindex.ai/doc/doc-1/?type=ocr&format=page"
    assert tree_call["url"] == "https://api.pageindex.ai/doc/doc-1/?type=tree&summary=True"
def test_get_ocr_rejects_invalid_format():
    """An unsupported ``format`` value makes ``get_ocr`` raise ``ValueError``."""
    sdk_client = PageIndexClient("pi-test")
    with pytest.raises(ValueError, match="Format parameter must be"):
        sdk_client.get_ocr("doc-1", format="bad")
def test_submit_query_uses_legacy_payload(monkeypatch):
    """``submit_query`` POSTs doc_id, query and thinking as JSON to ``/retrieval/``."""
    recorded = []

    def fake_request(method, url, headers=None, json=None, **kwargs):
        recorded.append({"method": method, "url": url, "headers": headers, "json": json})
        return FakeResponse(payload={"retrieval_id": "ret-1"})

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    sdk_client = PageIndexClient("pi-test")
    response = sdk_client.submit_query("doc-1", "What changed?", thinking=True)

    assert response == {"retrieval_id": "ret-1"}
    request = recorded[0]
    assert request["method"] == "POST"
    assert request["url"] == "https://api.pageindex.ai/retrieval/"
    expected_body = {
        "doc_id": "doc-1",
        "query": "What changed?",
        "thinking": True,
    }
    assert request["json"] == expected_body
def test_chat_completions_non_stream_returns_json(monkeypatch):
    """Without ``stream=True`` the parsed JSON payload is returned as-is.

    Also pins the request: POST to ``/chat/completions/`` with ``stream``
    false both as a request keyword and inside the JSON body.
    """
    recorded = []
    canned = {"choices": [{"message": {"content": "answer"}}]}

    def fake_request(method, url, headers=None, json=None, stream=False, **kwargs):
        recorded.append(
            dict(method=method, url=url, headers=headers, json=json, stream=stream)
        )
        return FakeResponse(payload=canned)

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    messages = [{"role": "user", "content": "hi"}]
    answer = PageIndexClient("pi-test").chat_completions(
        messages,
        doc_id=["doc-1"],
        temperature=0.1,
        enable_citations=True,
    )

    assert answer == canned
    request = recorded[0]
    assert request["method"] == "POST"
    assert request["url"] == "https://api.pageindex.ai/chat/completions/"
    assert request["stream"] is False
    assert request["json"] == {
        "messages": [{"role": "user", "content": "hi"}],
        "stream": False,
        "doc_id": ["doc-1"],
        "temperature": 0.1,
        "enable_citations": True,
    }
def test_chat_completions_stream_parses_text_chunks(monkeypatch):
    """Streaming yields the delta content of each SSE line up to ``[DONE]``."""
    recorded = []
    sse_lines = [
        b'data: {"choices":[{"delta":{"content":"hel"}}]}',
        b'data: {"choices":[{"delta":{"content":"lo"}}]}',
        b"data: [DONE]",
    ]

    def fake_request(method, url, **kwargs):
        recorded.append({"method": method, "url": url, "kwargs": kwargs})
        return FakeResponse(lines=sse_lines)

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    stream = PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "hi"}],
        stream=True,
    )
    pieces = list(stream)

    assert pieces == ["hel", "lo"]
    assert "timeout" not in recorded[0]["kwargs"]
def test_chat_completions_stream_metadata_returns_raw_chunks(monkeypatch):
    """With ``stream_metadata=True`` the decoded chunk dicts are yielded.

    The ``stream_metadata`` flag must not appear in the JSON request body.
    """
    recorded = []
    sse_lines = [
        b'data: {"object":"chat.completion.chunk"}',
        b"data: [DONE]",
    ]

    def fake_request(method, url, **kwargs):
        recorded.append({"method": method, "url": url, "json": kwargs.get("json")})
        return FakeResponse(lines=sse_lines)

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    stream = PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "hi"}],
        stream=True,
        stream_metadata=True,
    )
    raw_chunks = list(stream)

    assert raw_chunks == [{"object": "chat.completion.chunk"}]
    assert "stream_metadata" not in recorded[0]["json"]
def test_chat_completions_stream_errors_are_pageindex_api_error(monkeypatch):
    """A failure while iterating the stream surfaces as ``PageIndexAPIError``."""

    def fake_request(*args, **kwargs):
        return StreamingErrorResponse()

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    sdk_client = PageIndexClient("pi-test")
    chunk_iter = sdk_client.chat_completions(
        [{"role": "user", "content": "hi"}],
        stream=True,
    )

    expected = "Failed to stream chat completion: stream stalled"
    with pytest.raises(PageIndexAPIError, match=expected):
        list(chunk_iter)
def test_api_errors_are_pageindex_api_error(monkeypatch):
    """A 500 response from the API is raised as ``PageIndexAPIError``."""

    def fake_request(*args, **kwargs):
        return FakeResponse(status_code=500, text="server error")

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    sdk_client = PageIndexClient("pi-test")
    with pytest.raises(PageIndexAPIError, match="Failed to get document metadata"):
        sdk_client.get_document("doc-1")
def test_network_errors_are_wrapped_as_pageindex_api_error(monkeypatch):
    """A ``requests.Timeout`` raised by the transport is wrapped, message included."""

    def fake_request(*args, **kwargs):
        raise requests.Timeout("slow network")

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    sdk_client = PageIndexClient("pi-test")
    expected = "Failed to get document metadata: slow network"
    with pytest.raises(PageIndexAPIError, match=expected):
        sdk_client.get_document("doc-1")
def test_list_documents_validates_legacy_pagination():
    """``list_documents`` rejects ``limit=0`` and a negative ``offset``."""
    sdk_client = PageIndexClient("pi-test")

    with pytest.raises(ValueError, match="limit must be between 1 and 100"):
        sdk_client.list_documents(limit=0)

    with pytest.raises(ValueError, match="offset must be non-negative"):
        sdk_client.list_documents(offset=-1)
def test_chat_completions_stream_closes_response_after_done(monkeypatch):
    """The response is closed once the stream has been consumed to the end."""
    response = FakeResponse(
        lines=[
            b'data: {"choices":[{"delta":{"content":"hi"}}]}',
            b"data: [DONE]",
        ]
    )

    def fake_request(*args, **kwargs):
        return response

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    stream = PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "x"}],
        stream=True,
    )
    list(stream)  # drain the generator completely

    assert response.closed is True
def test_chat_completions_stream_closes_response_on_early_abandon(monkeypatch):
    """Closing the generator after one chunk still closes the response."""
    response = FakeResponse(
        lines=[
            b'data: {"choices":[{"delta":{"content":"a"}}]}',
            b'data: {"choices":[{"delta":{"content":"b"}}]}',
            b"data: [DONE]",
        ]
    )

    def fake_request(*args, **kwargs):
        return response

    monkeypatch.setattr("pageindex.cloud_api.requests.request", fake_request)

    stream = PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "x"}],
        stream=True,
    )
    next(stream)  # consume a single chunk, then abandon the stream
    stream.close()

    assert response.closed is True
def test_empty_api_key_warns_and_falls_back_to_local(caplog, tmp_path, monkeypatch):
    """An empty ``api_key`` logs a warning and leaves the legacy cloud API unset."""
    import logging

    monkeypatch.setenv("OPENAI_API_KEY", "sk-test")

    with caplog.at_level(logging.WARNING, logger="pageindex.client"):
        local_client = PageIndexClient(api_key="", storage_path=str(tmp_path))

    warned = any("empty api_key" in record.message for record in caplog.records)
    assert warned
    assert local_client._legacy_cloud_api is None

View file

@ -0,0 +1,106 @@
import sys
import asyncio
from types import SimpleNamespace
from pageindex import utils
def test_remove_fields_keeps_legacy_max_len():
    """``remove_fields`` drops the listed fields and truncates long strings.

    With ``max_len=5`` the expected values are the first five characters
    followed by ``...``, including strings inside nested nodes.
    """
    document = {
        "title": "A long title",
        "text": "hidden",
        "nodes": [{"summary": "abcdefghijklmnopqrstuvwxyz"}],
    }

    cleaned = utils.remove_fields(document, fields=["text"], max_len=5)

    assert "text" not in cleaned
    assert cleaned["title"] == "A lon..."
    first_node = cleaned["nodes"][0]
    assert first_node["summary"] == "abcde..."
def test_create_node_mapping_keeps_legacy_page_ranges():
    """``create_node_mapping`` derives page ranges from ``page_index`` values.

    Expected ranges for this tree: the root spans 1-3 and the child spans
    from 3 to ``max_page`` (8).
    """
    child = {"node_id": "0002", "title": "Child", "page_index": 3, "nodes": []}
    root = {
        "node_id": "0001",
        "title": "Root",
        "page_index": 1,
        "nodes": [child],
    }
    tree = [root]

    plain = utils.create_node_mapping(tree)
    ranged = utils.create_node_mapping(tree, include_page_ranges=True, max_page=8)

    assert plain["0001"]["title"] == "Root"

    root_entry = ranged["0001"]
    assert root_entry["start_index"] == 1
    assert root_entry["end_index"] == 3

    child_entry = ranged["0002"]
    assert child_entry["start_index"] == 3
    assert child_entry["end_index"] == 8
def test_create_node_mapping_prefers_existing_start_end_ranges():
    """Explicit ``start_index``/``end_index`` on nodes are kept unchanged."""
    child = {"node_id": "0002", "title": "Child", "start_index": 3, "end_index": 5}
    root = {
        "node_id": "0001",
        "title": "Root",
        "start_index": 1,
        "end_index": 10,
        "nodes": [child],
    }

    ranged = utils.create_node_mapping([root], include_page_ranges=True, max_page=12)

    root_entry = ranged["0001"]
    assert root_entry["start_index"] == 1
    assert root_entry["end_index"] == 10

    child_entry = ranged["0002"]
    assert child_entry["start_index"] == 3
    assert child_entry["end_index"] == 5
def test_print_tree_keeps_legacy_exclude_fields(capsys):
    """``print_tree`` prints the title but not the text or ``page_index``."""
    root = {"node_id": "0001", "title": "Root", "text": "hidden", "page_index": 1}

    utils.print_tree([root])
    printed = capsys.readouterr().out

    assert "Root" in printed
    assert "hidden" not in printed
    assert "page_index" not in printed
def test_call_llm_keeps_legacy_async_openai_contract(monkeypatch):
    """``call_llm`` calls ``AsyncOpenAI`` chat completions and strips the reply.

    A fake ``openai`` module is placed in ``sys.modules`` so that the keyword
    arguments passed to ``create`` can be recorded and compared.
    """
    create_kwargs = []

    class FakeCompletions:
        async def create(self, **kwargs):
            create_kwargs.append(kwargs)
            # The surrounding whitespace lets the test check that it is stripped.
            reply = SimpleNamespace(content=" answer ")
            return SimpleNamespace(choices=[SimpleNamespace(message=reply)])

    class FakeAsyncOpenAI:
        def __init__(self, api_key):
            self.api_key = api_key
            self.chat = SimpleNamespace(completions=FakeCompletions())

    openai_stub = SimpleNamespace(AsyncOpenAI=FakeAsyncOpenAI)
    monkeypatch.setitem(sys.modules, "openai", openai_stub)

    pending = utils.call_llm(
        "hello",
        api_key="sk-test",
        model="gpt-test",
        temperature=0.2,
    )
    answer = asyncio.run(pending)

    assert answer == "answer"
    expected_call = {
        "model": "gpt-test",
        "messages": [{"role": "user", "content": "hello"}],
        "temperature": 0.2,
    }
    assert create_kwargs == [expected_call]