mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-13 00:32:36 +02:00
feat: compatible with PageIndex SDK (#238)
* feat: compatible with PageIndex SDK
* corner cases fixed
* fix: mock behavior of old SDK
* fix: close streaming response and warn on empty api_key
- LegacyCloudAPI: close response in `finally` for both _stream_chat_response
variants so abandoned iterators no longer leak the TCP connection.
- PageIndexClient: emit a warning instead of silently falling back to local
when api_key is the empty string, surfacing typical env-var-unset misconfig.
- FakeResponse: add close()/closed to match the real requests.Response API.
- Add unit coverage for stream close (both paths) and the empty-api_key warning.
- Add scripts/e2e_legacy_sdk.py to smoke-test the legacy SDK contract end-to-end
against api.pageindex.ai.
* chore: mark legacy SDK methods with @deprecated and docstring pointers
- Decorate the 12 PageIndexClient cloud-SDK compat methods with
@typing_extensions.deprecated(..., category=PendingDeprecationWarning):
- IDE/type-checkers render them with a strikethrough hint
- runtime warnings stay silent by default (no spam for existing callers),
surfaceable via `python -W default::PendingDeprecationWarning`
- Add a one-line docstring on each pointing to the Collection-based equivalent.
- Promote typing-extensions to a direct dependency (was transitive via litellm).
---------
Co-authored-by: XinyanZhou <xinyanzhou@XinyanZhoudeMacBook-Pro.local>
Co-authored-by: saccharin98 <xinyanzhou938@gmail.com>
Co-authored-by: mountain <kose2livs@gmail.com>
This commit is contained in:
parent
6d29886892
commit
595895cf28
10 changed files with 1030 additions and 20 deletions
|
|
@ -1,5 +1,6 @@
|
|||
from pageindex.errors import (
|
||||
PageIndexError,
|
||||
PageIndexAPIError,
|
||||
CollectionNotFoundError,
|
||||
DocumentNotFoundError,
|
||||
IndexingError,
|
||||
|
|
@ -9,9 +10,10 @@ from pageindex.errors import (
|
|||
|
||||
|
||||
def test_all_errors_inherit_from_base():
|
||||
for cls in [CollectionNotFoundError, DocumentNotFoundError, IndexingError, CloudAPIError, FileTypeError]:
|
||||
for cls in [PageIndexAPIError, CollectionNotFoundError, DocumentNotFoundError, IndexingError, CloudAPIError, FileTypeError]:
|
||||
assert issubclass(cls, PageIndexError)
|
||||
assert issubclass(cls, Exception)
|
||||
assert issubclass(CloudAPIError, PageIndexAPIError)
|
||||
|
||||
|
||||
def test_error_message():
|
||||
|
|
@ -20,7 +22,7 @@ def test_error_message():
|
|||
|
||||
|
||||
def test_catch_base_catches_all():
|
||||
for cls in [CollectionNotFoundError, DocumentNotFoundError, IndexingError, CloudAPIError, FileTypeError]:
|
||||
for cls in [PageIndexAPIError, CollectionNotFoundError, DocumentNotFoundError, IndexingError, CloudAPIError, FileTypeError]:
|
||||
try:
|
||||
raise cls("test")
|
||||
except PageIndexError:
|
||||
|
|
|
|||
325
tests/test_legacy_sdk_contract.py
Normal file
325
tests/test_legacy_sdk_contract.py
Normal file
|
|
@ -0,0 +1,325 @@
|
|||
import pytest
|
||||
import requests
|
||||
|
||||
from pageindex.client import PageIndexAPIError as ClientPageIndexAPIError
|
||||
from pageindex import PageIndexAPIError, PageIndexClient
|
||||
from pageindex.client import CloudClient
|
||||
|
||||
|
||||
class FakeResponse:
    """Minimal stand-in for ``requests.Response`` used by the contract tests.

    Mimics exactly the subset of the real API the legacy SDK touches:
    ``status_code``, ``text``, ``json()``, ``iter_lines()`` and
    ``close()``/``closed`` (the last two were added to match the real
    requests.Response API, per the commit message).
    """

    def __init__(self, status_code=200, payload=None, text="ok", lines=None):
        self.status_code = status_code
        # Use `is None` rather than truthiness so an explicitly supplied
        # empty payload/lines argument is honoured instead of being
        # silently replaced by a fresh default.
        self._payload = {} if payload is None else payload
        self.text = text
        self._lines = [] if lines is None else lines
        # Flipped by close(); the stream-close tests assert on this.
        self.closed = False

    def json(self):
        """Return the canned JSON payload."""
        return self._payload

    def iter_lines(self):
        """Yield the canned SSE lines, mirroring requests' streaming API."""
        return iter(self._lines)

    def close(self):
        """Record closure so tests can assert the stream was released."""
        self.closed = True
|
||||
|
||||
|
||||
class StreamingErrorResponse(FakeResponse):
    """A FakeResponse whose stream fails immediately with a read timeout."""

    def iter_lines(self):
        # Simulates the connection stalling mid-stream; the SDK must wrap
        # this in PageIndexAPIError.
        raise requests.ReadTimeout("stream stalled")
|
||||
|
||||
|
||||
def test_legacy_imports_and_initializers():
    """All legacy construction paths wire up the cloud API with the key."""
    constructed = [
        PageIndexClient("pi-test"),          # positional api_key
        PageIndexClient(api_key="pi-test"),  # keyword api_key
        CloudClient(api_key="pi-test"),      # legacy alias class
    ]
    for instance in constructed:
        assert instance._legacy_cloud_api.api_key == "pi-test"

    # The error type is a real exception and is re-exported unchanged.
    assert issubclass(PageIndexAPIError, Exception)
    assert ClientPageIndexAPIError is PageIndexAPIError
|
||||
|
||||
|
||||
def test_legacy_methods_exist():
    """Every cloud-SDK compat method is still exposed as a callable."""
    legacy_surface = (
        "submit_document",
        "get_ocr",
        "get_tree",
        "is_retrieval_ready",
        "submit_query",
        "get_retrieval",
        "chat_completions",
        "get_document",
        "delete_document",
        "list_documents",
        "create_folder",
        "list_folders",
    )
    client = PageIndexClient("pi-test")
    for name in legacy_surface:
        assert callable(getattr(client, name))
|
||||
|
||||
|
||||
def test_legacy_base_url_can_be_overridden_from_client(monkeypatch):
    """Overriding PageIndexClient.BASE_URL redirects legacy requests."""
    recorded = []

    def capture(method, url, headers=None, **kwargs):
        recorded.append({"method": method, "url": url, "headers": headers})
        return FakeResponse(payload={"id": "doc-1"})

    monkeypatch.setattr("pageindex.cloud_api.requests.request", capture)
    monkeypatch.setattr(PageIndexClient, "BASE_URL", "https://staging.pageindex.test")

    document = PageIndexClient("pi-test").get_document("doc-1")

    assert document == {"id": "doc-1"}
    first = recorded[0]
    assert first["method"] == "GET"
    assert first["url"] == "https://staging.pageindex.test/doc/doc-1/metadata/"
    assert first["headers"] == {"api_key": "pi-test"}
|
||||
|
||||
|
||||
def test_submit_document_uses_legacy_endpoint(monkeypatch, tmp_path):
    """submit_document posts multipart form data to the legacy /doc/ endpoint."""
    recorded = []

    def capture(method, url, headers=None, files=None, data=None, **kwargs):
        recorded.append({
            "method": method,
            "url": url,
            "headers": headers,
            "data": data,
            "files": files,
            "kwargs": kwargs,
        })
        return FakeResponse(payload={"doc_id": "doc-1"})

    monkeypatch.setattr("pageindex.cloud_api.requests.request", capture)

    pdf_path = tmp_path / "doc.pdf"
    pdf_path.write_bytes(b"%PDF-1.4")

    result = PageIndexClient("pi-test").submit_document(
        str(pdf_path),
        mode="mcp",
        beta_headers=["block_reference"],
        folder_id="folder-1",
    )

    assert result == {"doc_id": "doc-1"}
    call = recorded[0]
    assert call["method"] == "POST"
    assert call["url"] == "https://api.pageindex.ai/doc/"
    assert call["headers"] == {"api_key": "pi-test"}
    # No explicit timeout is forwarded on the legacy path.
    assert "timeout" not in call["kwargs"]
    form = call["data"]
    assert form["if_retrieval"] is True
    assert form["mode"] == "mcp"
    # beta_headers is JSON-encoded into the form field.
    assert form["beta_headers"] == '["block_reference"]'
    assert form["folder_id"] == "folder-1"
|
||||
|
||||
|
||||
def test_get_ocr_and_tree_use_legacy_urls(monkeypatch):
    """get_ocr/get_tree hit the legacy query-string URLs with the api_key header."""
    recorded = []

    def capture(method, url, headers=None, **kwargs):
        recorded.append({"method": method, "url": url, "headers": headers})
        return FakeResponse(payload={"status": "completed", "retrieval_ready": True})

    monkeypatch.setattr("pageindex.cloud_api.requests.request", capture)
    client = PageIndexClient("pi-test")

    assert client.get_ocr("doc-1", format="page")["status"] == "completed"
    assert client.get_tree("doc-1", node_summary=True)["retrieval_ready"] is True

    assert recorded[0]["method"] == "GET"
    assert recorded[0]["url"] == "https://api.pageindex.ai/doc/doc-1/?type=ocr&format=page"
    assert recorded[1]["url"] == "https://api.pageindex.ai/doc/doc-1/?type=tree&summary=True"
|
||||
|
||||
|
||||
def test_get_ocr_rejects_invalid_format():
    """An unknown OCR format raises ValueError before any request is made."""
    client = PageIndexClient("pi-test")
    with pytest.raises(ValueError, match="Format parameter must be"):
        client.get_ocr("doc-1", format="bad")
|
||||
|
||||
|
||||
def test_submit_query_uses_legacy_payload(monkeypatch):
    """submit_query posts the legacy JSON body to /retrieval/."""
    recorded = []

    def capture(method, url, headers=None, json=None, **kwargs):
        recorded.append({"method": method, "url": url, "headers": headers, "json": json})
        return FakeResponse(payload={"retrieval_id": "ret-1"})

    monkeypatch.setattr("pageindex.cloud_api.requests.request", capture)

    response = PageIndexClient("pi-test").submit_query("doc-1", "What changed?", thinking=True)

    assert response == {"retrieval_id": "ret-1"}
    call = recorded[0]
    assert call["method"] == "POST"
    assert call["url"] == "https://api.pageindex.ai/retrieval/"
    assert call["json"] == {"doc_id": "doc-1", "query": "What changed?", "thinking": True}
|
||||
|
||||
|
||||
def test_chat_completions_non_stream_returns_json(monkeypatch):
    """Non-streaming chat_completions returns the raw JSON response body."""
    recorded = []
    expected = {"choices": [{"message": {"content": "answer"}}]}

    def capture(method, url, headers=None, json=None, stream=False, **kwargs):
        recorded.append({
            "method": method,
            "url": url,
            "headers": headers,
            "json": json,
            "stream": stream,
        })
        return FakeResponse(payload=expected)

    monkeypatch.setattr("pageindex.cloud_api.requests.request", capture)

    result = PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "hi"}],
        doc_id=["doc-1"],
        temperature=0.1,
        enable_citations=True,
    )

    assert result == expected
    call = recorded[0]
    assert call["method"] == "POST"
    assert call["url"] == "https://api.pageindex.ai/chat/completions/"
    assert call["stream"] is False
    # Extra keyword arguments are forwarded verbatim into the JSON payload.
    assert call["json"] == {
        "messages": [{"role": "user", "content": "hi"}],
        "stream": False,
        "doc_id": ["doc-1"],
        "temperature": 0.1,
        "enable_citations": True,
    }
|
||||
|
||||
|
||||
def test_chat_completions_stream_parses_text_chunks(monkeypatch):
    """Streaming mode yields the decoded delta text and stops at [DONE]."""
    recorded = []
    sse_lines = [
        b'data: {"choices":[{"delta":{"content":"hel"}}]}',
        b'data: {"choices":[{"delta":{"content":"lo"}}]}',
        b"data: [DONE]",
    ]

    def capture(method, url, **kwargs):
        recorded.append({"method": method, "url": url, "kwargs": kwargs})
        return FakeResponse(lines=sse_lines)

    monkeypatch.setattr("pageindex.cloud_api.requests.request", capture)

    pieces = list(PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "hi"}],
        stream=True,
    ))

    assert pieces == ["hel", "lo"]
    # No explicit timeout is forwarded on the streaming path either.
    assert "timeout" not in recorded[0]["kwargs"]
|
||||
|
||||
|
||||
def test_chat_completions_stream_metadata_returns_raw_chunks(monkeypatch):
    """stream_metadata=True yields parsed chunk dicts and is stripped from the payload."""
    recorded = []
    sse_lines = [
        b'data: {"object":"chat.completion.chunk"}',
        b"data: [DONE]",
    ]

    def capture(method, url, **kwargs):
        recorded.append({"method": method, "url": url, "json": kwargs.get("json")})
        return FakeResponse(lines=sse_lines)

    monkeypatch.setattr("pageindex.cloud_api.requests.request", capture)

    chunk_list = list(PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "hi"}],
        stream=True,
        stream_metadata=True,
    ))

    assert chunk_list == [{"object": "chat.completion.chunk"}]
    # stream_metadata is a client-side flag only; it never reaches the wire.
    assert "stream_metadata" not in recorded[0]["json"]
|
||||
|
||||
|
||||
def test_chat_completions_stream_errors_are_pageindex_api_error(monkeypatch):
    """Transport errors during streaming surface as PageIndexAPIError."""
    monkeypatch.setattr(
        "pageindex.cloud_api.requests.request",
        lambda *args, **kwargs: StreamingErrorResponse(),
    )

    stream = PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "hi"}],
        stream=True,
    )

    # The error is raised lazily, only once the generator is consumed.
    with pytest.raises(PageIndexAPIError, match="Failed to stream chat completion: stream stalled"):
        list(stream)
|
||||
|
||||
|
||||
def test_api_errors_are_pageindex_api_error(monkeypatch):
    """Non-2xx HTTP responses are wrapped in PageIndexAPIError."""
    monkeypatch.setattr(
        "pageindex.cloud_api.requests.request",
        lambda *args, **kwargs: FakeResponse(status_code=500, text="server error"),
    )

    with pytest.raises(PageIndexAPIError, match="Failed to get document metadata"):
        PageIndexClient("pi-test").get_document("doc-1")
|
||||
|
||||
|
||||
def test_network_errors_are_wrapped_as_pageindex_api_error(monkeypatch):
    """requests exceptions are re-raised as PageIndexAPIError with the cause message."""
    def explode(*args, **kwargs):
        raise requests.Timeout("slow network")

    monkeypatch.setattr("pageindex.cloud_api.requests.request", explode)

    with pytest.raises(PageIndexAPIError, match="Failed to get document metadata: slow network"):
        PageIndexClient("pi-test").get_document("doc-1")
|
||||
|
||||
|
||||
def test_list_documents_validates_legacy_pagination():
    """list_documents enforces the legacy limit/offset bounds client-side."""
    client = PageIndexClient("pi-test")

    with pytest.raises(ValueError, match="limit must be between 1 and 100"):
        client.list_documents(limit=0)
    with pytest.raises(ValueError, match="offset must be non-negative"):
        client.list_documents(offset=-1)
|
||||
|
||||
|
||||
def test_chat_completions_stream_closes_response_after_done(monkeypatch):
    """The streaming response is closed once [DONE] has been consumed."""
    response = FakeResponse(lines=[
        b'data: {"choices":[{"delta":{"content":"hi"}}]}',
        b"data: [DONE]",
    ])
    monkeypatch.setattr(
        "pageindex.cloud_api.requests.request",
        lambda *args, **kwargs: response,
    )

    list(PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "x"}], stream=True,
    ))

    assert response.closed is True
|
||||
|
||||
|
||||
def test_chat_completions_stream_closes_response_on_early_abandon(monkeypatch):
    """Abandoning the generator mid-stream still closes the response.

    Guards against the TCP-connection leak fixed by the `finally` close in
    the legacy stream helpers.
    """
    response = FakeResponse(lines=[
        b'data: {"choices":[{"delta":{"content":"a"}}]}',
        b'data: {"choices":[{"delta":{"content":"b"}}]}',
        b"data: [DONE]",
    ])
    monkeypatch.setattr(
        "pageindex.cloud_api.requests.request",
        lambda *args, **kwargs: response,
    )

    stream = PageIndexClient("pi-test").chat_completions(
        [{"role": "user", "content": "x"}], stream=True,
    )
    next(stream)      # consume one chunk...
    stream.close()    # ...then abandon the generator early

    assert response.closed is True
|
||||
|
||||
|
||||
def test_empty_api_key_warns_and_falls_back_to_local(caplog, tmp_path, monkeypatch):
    """An empty api_key logs a warning and disables the legacy cloud API."""
    import logging

    monkeypatch.setenv("OPENAI_API_KEY", "sk-test")
    with caplog.at_level(logging.WARNING, logger="pageindex.client"):
        client = PageIndexClient(api_key="", storage_path=str(tmp_path))

    warnings_seen = [rec for rec in caplog.records if "empty api_key" in rec.message]
    assert warnings_seen
    assert client._legacy_cloud_api is None
|
||||
106
tests/test_legacy_utils_contract.py
Normal file
106
tests/test_legacy_utils_contract.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
import sys
|
||||
import asyncio
|
||||
from types import SimpleNamespace
|
||||
|
||||
from pageindex import utils
|
||||
|
||||
|
||||
def test_remove_fields_keeps_legacy_max_len():
    """remove_fields drops listed fields and truncates strings to max_len + '...'."""
    payload = {
        "title": "A long title",
        "text": "hidden",
        "nodes": [{"summary": "abcdefghijklmnopqrstuvwxyz"}],
    }

    cleaned = utils.remove_fields(payload, fields=["text"], max_len=5)

    assert "text" not in cleaned
    # Truncation applies recursively, including nested node summaries.
    assert cleaned["title"] == "A lon..."
    assert cleaned["nodes"][0]["summary"] == "abcde..."
|
||||
|
||||
|
||||
def test_create_node_mapping_keeps_legacy_page_ranges():
    """include_page_ranges derives start/end indices from page_index and max_page."""
    tree = [
        {
            "node_id": "0001",
            "title": "Root",
            "page_index": 1,
            "nodes": [
                {"node_id": "0002", "title": "Child", "page_index": 3, "nodes": []},
            ],
        }
    ]

    flat = utils.create_node_mapping(tree)
    with_ranges = utils.create_node_mapping(tree, include_page_ranges=True, max_page=8)

    assert flat["0001"]["title"] == "Root"
    # Root spans up to the child's start page; the last node runs to max_page.
    assert with_ranges["0001"]["start_index"] == 1
    assert with_ranges["0001"]["end_index"] == 3
    assert with_ranges["0002"]["start_index"] == 3
    assert with_ranges["0002"]["end_index"] == 8
|
||||
|
||||
|
||||
def test_create_node_mapping_prefers_existing_start_end_ranges():
    """Nodes that already carry start/end indices keep them untouched."""
    tree = [
        {
            "node_id": "0001",
            "title": "Root",
            "start_index": 1,
            "end_index": 10,
            "nodes": [
                {"node_id": "0002", "title": "Child", "start_index": 3, "end_index": 5},
            ],
        }
    ]

    mapping = utils.create_node_mapping(tree, include_page_ranges=True, max_page=12)

    # Pre-existing ranges win over anything derived from max_page.
    expected_ranges = {"0001": (1, 10), "0002": (3, 5)}
    for node_id, (start, end) in expected_ranges.items():
        assert mapping[node_id]["start_index"] == start
        assert mapping[node_id]["end_index"] == end
|
||||
|
||||
|
||||
def test_print_tree_keeps_legacy_exclude_fields(capsys):
    """print_tree shows titles but hides text and page_index by default."""
    tree = [{"node_id": "0001", "title": "Root", "text": "hidden", "page_index": 1}]

    utils.print_tree(tree)
    printed = capsys.readouterr().out

    assert "Root" in printed
    assert "hidden" not in printed
    assert "page_index" not in printed
|
||||
|
||||
|
||||
def test_call_llm_keeps_legacy_async_openai_contract(monkeypatch):
    """call_llm drives AsyncOpenAI chat completions and strips the reply text."""
    seen_kwargs = []

    class StubCompletions:
        # Captures the create(...) kwargs and returns a canned reply with
        # surrounding whitespace, which call_llm is expected to strip.
        async def create(self, **kwargs):
            seen_kwargs.append(kwargs)
            reply = SimpleNamespace(message=SimpleNamespace(content=" answer "))
            return SimpleNamespace(choices=[reply])

    class StubAsyncOpenAI:
        def __init__(self, api_key):
            self.api_key = api_key
            self.chat = SimpleNamespace(completions=StubCompletions())

    # Shadow the real openai module so no network client is ever built.
    monkeypatch.setitem(sys.modules, "openai", SimpleNamespace(AsyncOpenAI=StubAsyncOpenAI))

    answer = asyncio.run(utils.call_llm(
        "hello",
        api_key="sk-test",
        model="gpt-test",
        temperature=0.2,
    ))

    assert answer == "answer"
    assert seen_kwargs == [{
        "model": "gpt-test",
        "messages": [{"role": "user", "content": "hello"}],
        "temperature": 0.2,
    }]
|
||||
Loading…
Add table
Add a link
Reference in a new issue