mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-09 19:45:15 +02:00
Add pypdfium2 as optional PDF parser
Default behavior unchanged. Users can opt in via pdf_parser="pypdfium2" for cleaner text extraction (no broken words, correct Unicode) and 3-5x faster parsing. PyPDF2 remains the default parser and no new required dependency is added; pypdfium2 is optional and is imported lazily, only when it is selected.
This commit is contained in:
parent
f50e529753
commit
9539fe7513
7 changed files with 95 additions and 51 deletions
|
|
@ -5,12 +5,10 @@ import asyncio
|
|||
import concurrent.futures
|
||||
from pathlib import Path
|
||||
|
||||
import PyPDF2
|
||||
|
||||
from .page_index import page_index
|
||||
from .page_index_md import md_to_tree
|
||||
from .retrieve import get_document, get_document_structure, get_page_content
|
||||
from .utils import ConfigLoader, remove_fields
|
||||
from .utils import ConfigLoader, read_pdf_pages, remove_fields
|
||||
|
||||
META_INDEX = "_meta.json"
|
||||
|
||||
|
|
@ -32,7 +30,8 @@ class PageIndexClient:
|
|||
|
||||
For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
|
||||
"""
|
||||
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None):
|
||||
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None,
|
||||
workspace: str = None, pdf_parser: str = None):
|
||||
if api_key:
|
||||
os.environ["OPENAI_API_KEY"] = api_key
|
||||
elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
|
||||
|
|
@ -43,9 +42,12 @@ class PageIndexClient:
|
|||
overrides["model"] = model
|
||||
if retrieve_model:
|
||||
overrides["retrieve_model"] = retrieve_model
|
||||
if pdf_parser:
|
||||
overrides["pdf_parser"] = pdf_parser
|
||||
opt = ConfigLoader().load(overrides or None)
|
||||
self.model = opt.model
|
||||
self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model)
|
||||
self.pdf_parser = opt.pdf_parser
|
||||
if self.workspace:
|
||||
self.workspace.mkdir(parents=True, exist_ok=True)
|
||||
self.documents = {}
|
||||
|
|
@ -74,14 +76,12 @@ class PageIndexClient:
|
|||
if_add_node_summary='yes',
|
||||
if_add_node_text='yes',
|
||||
if_add_node_id='yes',
|
||||
if_add_doc_description='yes'
|
||||
if_add_doc_description='yes',
|
||||
pdf_parser=self.pdf_parser,
|
||||
)
|
||||
# Extract per-page text so queries don't need the original PDF
|
||||
pages = []
|
||||
with open(file_path, 'rb') as f:
|
||||
pdf_reader = PyPDF2.PdfReader(f)
|
||||
for i, page in enumerate(pdf_reader.pages, 1):
|
||||
pages.append({'page': i, 'content': page.extract_text() or ''})
|
||||
page_texts = read_pdf_pages(file_path, pdf_parser=self.pdf_parser)
|
||||
pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)]
|
||||
|
||||
self.documents[doc_id] = {
|
||||
'id': doc_id,
|
||||
|
|
@ -90,6 +90,7 @@ class PageIndexClient:
|
|||
'doc_name': result.get('doc_name', ''),
|
||||
'doc_description': result.get('doc_description', ''),
|
||||
'page_count': len(pages),
|
||||
'pdf_parser': self.pdf_parser,
|
||||
'structure': result['structure'],
|
||||
'pages': pages,
|
||||
}
|
||||
|
|
@ -140,6 +141,8 @@ class PageIndexClient:
|
|||
}
|
||||
if doc.get('type') == 'pdf':
|
||||
entry['page_count'] = doc.get('page_count')
|
||||
if doc.get('pdf_parser'):
|
||||
entry['pdf_parser'] = doc['pdf_parser']
|
||||
elif doc.get('type') == 'md':
|
||||
entry['line_count'] = doc.get('line_count')
|
||||
return entry
|
||||
|
|
|
|||
|
|
@ -7,4 +7,5 @@ max_token_num_each_node: 20000
|
|||
if_add_node_id: "yes"
|
||||
if_add_node_summary: "yes"
|
||||
if_add_doc_description: "no"
|
||||
if_add_node_text: "no"
|
||||
if_add_node_text: "no"
|
||||
pdf_parser: "PyPDF2" # text extractor: "PyPDF2" (default, no extra install), "pypdfium2" (pip install pypdfium2), or "PyMuPDF"
|
||||
|
|
@ -1074,7 +1074,7 @@ def page_index_main(doc, opt=None):
|
|||
raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")
|
||||
|
||||
print('Parsing PDF...')
|
||||
page_list = get_page_tokens(doc, model=opt.model)
|
||||
page_list = get_page_tokens(doc, model=opt.model, pdf_parser=opt.pdf_parser)
|
||||
|
||||
logger.info({'total_page_number': len(page_list)})
|
||||
logger.info({'total_token': sum([page[1] for page in page_list])})
|
||||
|
|
@ -1111,8 +1111,9 @@ def page_index_main(doc, opt=None):
|
|||
|
||||
|
||||
def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
|
||||
if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):
|
||||
|
||||
if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None,
|
||||
pdf_parser=None):
|
||||
|
||||
user_opt = {
|
||||
arg: value for arg, value in locals().items()
|
||||
if arg != "doc" and value is not None
|
||||
|
|
|
|||
|
|
@ -1,10 +1,9 @@
|
|||
import json
|
||||
import PyPDF2
|
||||
|
||||
try:
|
||||
from .utils import get_number_of_pages, remove_fields
|
||||
from .utils import get_number_of_pages, read_pdf_pages, remove_fields
|
||||
except ImportError:
|
||||
from utils import get_number_of_pages, remove_fields
|
||||
from utils import get_number_of_pages, read_pdf_pages, remove_fields
|
||||
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
|
@ -34,7 +33,11 @@ def _count_pages(doc_info: dict) -> int:
|
|||
|
||||
|
||||
def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
|
||||
"""Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF."""
|
||||
"""Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF.
|
||||
|
||||
Honors the parser recorded on the document so cache-miss reads stay consistent
|
||||
with the originally-indexed text. Defaults to PyPDF2 for legacy documents.
|
||||
"""
|
||||
cached_pages = doc_info.get('pages')
|
||||
if cached_pages:
|
||||
page_map = {p['page']: p['content'] for p in cached_pages}
|
||||
|
|
@ -42,15 +45,14 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
|
|||
{'page': p, 'content': page_map[p]}
|
||||
for p in page_nums if p in page_map
|
||||
]
|
||||
path = doc_info['path']
|
||||
with open(path, 'rb') as f:
|
||||
pdf_reader = PyPDF2.PdfReader(f)
|
||||
total = len(pdf_reader.pages)
|
||||
valid_pages = [p for p in page_nums if 1 <= p <= total]
|
||||
return [
|
||||
{'page': p, 'content': pdf_reader.pages[p - 1].extract_text() or ''}
|
||||
for p in valid_pages
|
||||
]
|
||||
parser = doc_info.get('pdf_parser') or 'PyPDF2'
|
||||
all_pages = read_pdf_pages(doc_info['path'], pdf_parser=parser)
|
||||
total = len(all_pages)
|
||||
valid_pages = [p for p in page_nums if 1 <= p <= total]
|
||||
return [
|
||||
{'page': p, 'content': all_pages[p - 1]}
|
||||
for p in valid_pages
|
||||
]
|
||||
|
||||
|
||||
def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
|
||||
|
|
|
|||
|
|
@ -384,30 +384,63 @@ def add_preface_if_needed(data):
|
|||
|
||||
|
||||
|
||||
def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"):
|
||||
# Parser names accepted by read_pdf_pages (matched exactly, case-sensitive).
SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF")


def read_pdf_pages(doc, pdf_parser="PyPDF2"):
    """Return a list of per-page text strings using the selected parser.

    Args:
        doc: PDF source, either a file path (str/Path) or a BytesIO.
        pdf_parser: One of SUPPORTED_PDF_PARSERS. "PyPDF2" is the default.
            "pypdfium2" is an optional dependency that is imported only when
            this branch is selected, so it does not need to be installed
            otherwise. "PyMuPDF" uses the module-level `pymupdf` import.

    Returns:
        One string per page, in the order the parser yields the pages. With
        PyPDF2 and pypdfium2 an empty extraction result is returned as "".

    Raises:
        ImportError: `pdf_parser` is "pypdfium2" but the package is missing.
        ValueError: `pdf_parser` is not one of SUPPORTED_PDF_PARSERS.
    """
    if pdf_parser == "PyPDF2":
        reader = PyPDF2.PdfReader(doc)
        return [(p.extract_text() or "") for p in reader.pages]

    if pdf_parser == "pypdfium2":
        try:
            import pypdfium2 as pdfium
        except ImportError as e:
            raise ImportError(
                "pdf_parser='pypdfium2' requires the optional dependency. "
                "Install it with: pip install pypdfium2"
            ) from e
        # A BytesIO is handed over as raw bytes; anything else is used as a path.
        source = doc.getvalue() if isinstance(doc, BytesIO) else str(doc)
        pdf = pdfium.PdfDocument(source)
        try:
            pages = []
            for i in range(len(pdf)):
                page = pdf[i]
                # The page handle is released even when get_textpage() itself
                # fails, not only when the text read fails.
                try:
                    tp = page.get_textpage()
                    try:
                        text = (tp.get_text_bounded() or "").replace("\r\n", "\n")
                    finally:
                        tp.close()
                finally:
                    page.close()
                pages.append(text)
            return pages
        finally:
            pdf.close()

    if pdf_parser == "PyMuPDF":
        if isinstance(doc, BytesIO):
            d = pymupdf.open(stream=doc, filetype="pdf")
        else:
            d = pymupdf.open(str(doc))
        try:
            return [p.get_text() for p in d]
        finally:
            d.close()

    raise ValueError(
        f"Unsupported pdf_parser={pdf_parser!r}. Choose from {SUPPORTED_PDF_PARSERS}."
    )
|
||||
|
||||
|
||||
def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"):
    """Pair each page's extracted text with its token count.

    Page text comes from read_pdf_pages(pdf_path, pdf_parser=pdf_parser); the
    count for each page comes from litellm.token_counter called with `model`.

    Returns:
        A list of (page_text, token_count) tuples, one per page.
    """
    page_list = []
    for page_text in read_pdf_pages(pdf_path, pdf_parser=pdf_parser):
        token_length = litellm.token_counter(model=model, text=page_text)
        page_list.append((page_text, token_length))
    return page_list
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
litellm==1.83.7
|
||||
# openai-agents # optional: required for examples/agentic_vectorless_rag_demo.py
|
||||
pymupdf==1.26.4
|
||||
# pypdfium2 # optional: enables pdf_parser="pypdfium2" (cleaner text, faster, Apache 2.0)
|
||||
PyPDF2==3.0.1
|
||||
python-dotenv==1.2.2
|
||||
pyyaml==6.0.2
|
||||
|
|
|
|||
|
|
@ -28,7 +28,9 @@ if __name__ == "__main__":
|
|||
help='Whether to add doc description to the doc')
|
||||
parser.add_argument('--if-add-node-text', type=str, default=None,
|
||||
help='Whether to add text to the node')
|
||||
|
||||
parser.add_argument('--pdf-parser', type=str, default=None,
|
||||
help='PDF text extractor: PyPDF2 (default), pypdfium2 (requires `pip install pypdfium2`), or PyMuPDF')
|
||||
|
||||
# Markdown specific arguments
|
||||
parser.add_argument('--if-thinning', type=str, default='no',
|
||||
help='Whether to apply tree thinning for markdown (markdown only)')
|
||||
|
|
@ -61,6 +63,7 @@ if __name__ == "__main__":
|
|||
'if_add_node_summary': args.if_add_node_summary,
|
||||
'if_add_doc_description': args.if_add_doc_description,
|
||||
'if_add_node_text': args.if_add_node_text,
|
||||
'pdf_parser': args.pdf_parser,
|
||||
}
|
||||
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue