mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-09 19:45:15 +02:00
Make PageIndexClient parser-agnostic, pdf_parser per index() call
This commit is contained in:
parent
108cb28518
commit
63e11ef152
1 changed files with 12 additions and 8 deletions
|
|
@ -31,7 +31,7 @@ class PageIndexClient:
|
|||
For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
|
||||
"""
|
||||
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None,
|
||||
workspace: str = None, pdf_parser: str = DEFAULT_PDF_PARSER):
|
||||
workspace: str = None):
|
||||
if api_key:
|
||||
os.environ["OPENAI_API_KEY"] = api_key
|
||||
elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
|
||||
|
|
@ -45,15 +45,14 @@ class PageIndexClient:
|
|||
opt = ConfigLoader().load(overrides or None)
|
||||
self.model = opt.model
|
||||
self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model)
|
||||
self.pdf_parser = pdf_parser
|
||||
if self.workspace:
|
||||
self.workspace.mkdir(parents=True, exist_ok=True)
|
||||
self.documents = {}
|
||||
if self.workspace:
|
||||
self._load_workspace()
|
||||
|
||||
def index(self, file_path: str, mode: str = "auto") -> str:
|
||||
"""Index a document. Returns a document_id."""
|
||||
def index(self, file_path: str, mode: str = "auto", pdf_parser: str = DEFAULT_PDF_PARSER) -> str:
|
||||
"""Index a document. Returns a document_id. pdf_parser only affects PDF mode."""
|
||||
# Persist a canonical absolute path so workspace reloads do not
|
||||
# reinterpret caller-relative paths against the workspace directory.
|
||||
file_path = os.path.abspath(os.path.expanduser(file_path))
|
||||
|
|
@ -75,10 +74,10 @@ class PageIndexClient:
|
|||
if_add_node_text='yes',
|
||||
if_add_node_id='yes',
|
||||
if_add_doc_description='yes',
|
||||
pdf_parser=self.pdf_parser,
|
||||
pdf_parser=pdf_parser,
|
||||
)
|
||||
# Extract per-page text so queries don't need the original PDF
|
||||
page_texts = read_pdf_pages(file_path, pdf_parser=self.pdf_parser)
|
||||
page_texts = read_pdf_pages(file_path, pdf_parser=pdf_parser)
|
||||
pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)]
|
||||
|
||||
self.documents[doc_id] = {
|
||||
|
|
@ -226,7 +225,12 @@ class PageIndexClient:
|
|||
return get_document_structure(self.documents, doc_id)
|
||||
|
||||
def get_page_content(self, doc_id: str, pages: str) -> str:
|
||||
"""Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
|
||||
"""Return page content for the given pages string (e.g. '5-7', '3,8', '12').
|
||||
|
||||
Cache hit returns originally-indexed text. The rare cache-miss path
|
||||
re-reads with the default parser; callers needing parser-consistent
|
||||
fallback can use the low-level retrieve.get_page_content directly.
|
||||
"""
|
||||
if self.workspace:
|
||||
self._ensure_doc_loaded(doc_id)
|
||||
return get_page_content(self.documents, doc_id, pages, pdf_parser=self.pdf_parser)
|
||||
return get_page_content(self.documents, doc_id, pages)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue