diff --git a/pageindex/client.py b/pageindex/client.py index 30a6bb0..1d36409 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -88,7 +88,6 @@ class PageIndexClient: 'doc_name': result.get('doc_name', ''), 'doc_description': result.get('doc_description', ''), 'page_count': len(pages), - 'pdf_parser': self.pdf_parser, 'structure': result['structure'], 'pages': pages, } @@ -139,8 +138,6 @@ class PageIndexClient: } if doc.get('type') == 'pdf': entry['page_count'] = doc.get('page_count') - if doc.get('pdf_parser'): - entry['pdf_parser'] = doc['pdf_parser'] elif doc.get('type') == 'md': entry['line_count'] = doc.get('line_count') return entry @@ -232,4 +229,4 @@ class PageIndexClient: """Return page content for the given pages string (e.g. '5-7', '3,8', '12').""" if self.workspace: self._ensure_doc_loaded(doc_id) - return get_page_content(self.documents, doc_id, pages) + return get_page_content(self.documents, doc_id, pages, pdf_parser=self.pdf_parser) diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py index 52bc2eb..dabd258 100644 --- a/pageindex/retrieve.py +++ b/pageindex/retrieve.py @@ -32,7 +32,7 @@ def _count_pages(doc_info: dict) -> int: return get_number_of_pages(doc_info['path']) -def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: +def _get_pdf_page_content(doc_info: dict, page_nums: list[int], pdf_parser: str = DEFAULT_PDF_PARSER) -> list[dict]: """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF.""" cached_pages = doc_info.get('pages') if cached_pages: @@ -41,8 +41,7 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: {'page': p, 'content': page_map[p]} for p in page_nums if p in page_map ] - parser = doc_info.get('pdf_parser') or DEFAULT_PDF_PARSER - all_pages = read_pdf_pages(doc_info['path'], pdf_parser=parser) + all_pages = read_pdf_pages(doc_info['path'], pdf_parser=pdf_parser) total = len(all_pages) valid_pages = [p for p in page_nums if 1 <= p <= total] return [ @@ -105,7 +104,7 @@ def get_document_structure(documents: dict, doc_id: str) -> str: return json.dumps(structure_no_text, ensure_ascii=False) -def get_page_content(documents: dict, doc_id: str, pages: str) -> str: +def get_page_content(documents: dict, doc_id: str, pages: str, pdf_parser: str = DEFAULT_PDF_PARSER) -> str: """ Retrieve page content for a document. @@ -126,7 +125,7 @@ def get_page_content(documents: dict, doc_id: str, pages: str) -> str: try: if doc_info.get('type') == 'pdf': - content = _get_pdf_page_content(doc_info, page_nums) + content = _get_pdf_page_content(doc_info, page_nums, pdf_parser=pdf_parser) else: content = _get_md_page_content(doc_info, page_nums) except Exception as e: