mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-24 20:28:12 +02:00
Move pdf_parser off doc dict, pass via call args
This commit is contained in:
parent
ec1aaca4c9
commit
108cb28518
2 changed files with 5 additions and 9 deletions
|
|
@@ -88,7 +88,6 @@ class PageIndexClient:
|
||||||
'doc_name': result.get('doc_name', ''),
|
'doc_name': result.get('doc_name', ''),
|
||||||
'doc_description': result.get('doc_description', ''),
|
'doc_description': result.get('doc_description', ''),
|
||||||
'page_count': len(pages),
|
'page_count': len(pages),
|
||||||
'pdf_parser': self.pdf_parser,
|
|
||||||
'structure': result['structure'],
|
'structure': result['structure'],
|
||||||
'pages': pages,
|
'pages': pages,
|
||||||
}
|
}
|
||||||
|
|
@@ -139,8 +138,6 @@ class PageIndexClient:
|
||||||
}
|
}
|
||||||
if doc.get('type') == 'pdf':
|
if doc.get('type') == 'pdf':
|
||||||
entry['page_count'] = doc.get('page_count')
|
entry['page_count'] = doc.get('page_count')
|
||||||
if doc.get('pdf_parser'):
|
|
||||||
entry['pdf_parser'] = doc['pdf_parser']
|
|
||||||
elif doc.get('type') == 'md':
|
elif doc.get('type') == 'md':
|
||||||
entry['line_count'] = doc.get('line_count')
|
entry['line_count'] = doc.get('line_count')
|
||||||
return entry
|
return entry
|
||||||
|
|
@@ -232,4 +229,4 @@ class PageIndexClient:
|
||||||
"""Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
|
"""Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
|
||||||
if self.workspace:
|
if self.workspace:
|
||||||
self._ensure_doc_loaded(doc_id)
|
self._ensure_doc_loaded(doc_id)
|
||||||
return get_page_content(self.documents, doc_id, pages)
|
return get_page_content(self.documents, doc_id, pages, pdf_parser=self.pdf_parser)
|
||||||
|
|
|
||||||
|
|
@@ -32,7 +32,7 @@ def _count_pages(doc_info: dict) -> int:
|
||||||
return get_number_of_pages(doc_info['path'])
|
return get_number_of_pages(doc_info['path'])
|
||||||
|
|
||||||
|
|
||||||
def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
|
def _get_pdf_page_content(doc_info: dict, page_nums: list[int], pdf_parser: str = DEFAULT_PDF_PARSER) -> list[dict]:
|
||||||
"""Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF."""
|
"""Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF."""
|
||||||
cached_pages = doc_info.get('pages')
|
cached_pages = doc_info.get('pages')
|
||||||
if cached_pages:
|
if cached_pages:
|
||||||
|
|
@@ -41,8 +41,7 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
|
||||||
{'page': p, 'content': page_map[p]}
|
{'page': p, 'content': page_map[p]}
|
||||||
for p in page_nums if p in page_map
|
for p in page_nums if p in page_map
|
||||||
]
|
]
|
||||||
parser = doc_info.get('pdf_parser') or DEFAULT_PDF_PARSER
|
all_pages = read_pdf_pages(doc_info['path'], pdf_parser=pdf_parser)
|
||||||
all_pages = read_pdf_pages(doc_info['path'], pdf_parser=parser)
|
|
||||||
total = len(all_pages)
|
total = len(all_pages)
|
||||||
valid_pages = [p for p in page_nums if 1 <= p <= total]
|
valid_pages = [p for p in page_nums if 1 <= p <= total]
|
||||||
return [
|
return [
|
||||||
|
|
@@ -105,7 +104,7 @@ def get_document_structure(documents: dict, doc_id: str) -> str:
|
||||||
return json.dumps(structure_no_text, ensure_ascii=False)
|
return json.dumps(structure_no_text, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def get_page_content(documents: dict, doc_id: str, pages: str) -> str:
|
def get_page_content(documents: dict, doc_id: str, pages: str, pdf_parser: str = DEFAULT_PDF_PARSER) -> str:
|
||||||
"""
|
"""
|
||||||
Retrieve page content for a document.
|
Retrieve page content for a document.
|
||||||
|
|
||||||
|
|
@@ -126,7 +125,7 @@ def get_page_content(documents: dict, doc_id: str, pages: str) -> str:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if doc_info.get('type') == 'pdf':
|
if doc_info.get('type') == 'pdf':
|
||||||
content = _get_pdf_page_content(doc_info, page_nums)
|
content = _get_pdf_page_content(doc_info, page_nums, pdf_parser=pdf_parser)
|
||||||
else:
|
else:
|
||||||
content = _get_md_page_content(doc_info, page_nums)
|
content = _get_md_page_content(doc_info, page_nums)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue