mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-09 19:45:15 +02:00
Centralize default parser as DEFAULT_PDF_PARSER constant
This commit is contained in:
parent
1629ef4318
commit
ec1aaca4c9
5 changed files with 12 additions and 11 deletions
|
|
@ -8,7 +8,7 @@ from pathlib import Path
|
|||
from .page_index import page_index
|
||||
from .page_index_md import md_to_tree
|
||||
from .retrieve import get_document, get_document_structure, get_page_content
|
||||
from .utils import ConfigLoader, read_pdf_pages, remove_fields
|
||||
from .utils import ConfigLoader, DEFAULT_PDF_PARSER, read_pdf_pages, remove_fields
|
||||
|
||||
META_INDEX = "_meta.json"
|
||||
|
||||
|
|
@ -31,7 +31,7 @@ class PageIndexClient:
|
|||
For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
|
||||
"""
|
||||
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None,
|
||||
workspace: str = None, pdf_parser: str = "PyPDF2"):
|
||||
workspace: str = None, pdf_parser: str = DEFAULT_PDF_PARSER):
|
||||
if api_key:
|
||||
os.environ["OPENAI_API_KEY"] = api_key
|
||||
elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
|
||||
|
|
|
|||
|
|
@ -1063,7 +1063,7 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
|
|||
return toc_tree
|
||||
|
||||
|
||||
def page_index_main(doc, opt=None, pdf_parser="PyPDF2"):
|
||||
def page_index_main(doc, opt=None, pdf_parser=DEFAULT_PDF_PARSER):
|
||||
logger = JsonLogger(doc)
|
||||
|
||||
is_valid_pdf = (
|
||||
|
|
@ -1112,7 +1112,7 @@ def page_index_main(doc, opt=None, pdf_parser="PyPDF2"):
|
|||
|
||||
def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
|
||||
if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None,
|
||||
pdf_parser="PyPDF2"):
|
||||
pdf_parser=DEFAULT_PDF_PARSER):
|
||||
|
||||
user_opt = {
|
||||
arg: value for arg, value in locals().items()
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
import json
|
||||
|
||||
try:
|
||||
from .utils import get_number_of_pages, read_pdf_pages, remove_fields
|
||||
from .utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields
|
||||
except ImportError:
|
||||
from utils import get_number_of_pages, read_pdf_pages, remove_fields
|
||||
from utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields
|
||||
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
|
@ -41,7 +41,7 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
|
|||
{'page': p, 'content': page_map[p]}
|
||||
for p in page_nums if p in page_map
|
||||
]
|
||||
parser = doc_info.get('pdf_parser') or 'PyPDF2'
|
||||
parser = doc_info.get('pdf_parser') or DEFAULT_PDF_PARSER
|
||||
all_pages = read_pdf_pages(doc_info['path'], pdf_parser=parser)
|
||||
total = len(all_pages)
|
||||
valid_pages = [p for p in page_nums if 1 <= p <= total]
|
||||
|
|
|
|||
|
|
@ -385,9 +385,10 @@ def add_preface_if_needed(data):
|
|||
|
||||
|
||||
SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF")
|
||||
DEFAULT_PDF_PARSER = SUPPORTED_PDF_PARSERS[0]
|
||||
|
||||
|
||||
def read_pdf_pages(doc, pdf_parser="PyPDF2"):
|
||||
def read_pdf_pages(doc, pdf_parser=DEFAULT_PDF_PARSER):
|
||||
"""Return a list of per-page text strings using the selected parser.
|
||||
|
||||
`doc` may be a file path (str/Path) or a BytesIO. `pdf_parser` is one of
|
||||
|
|
@ -438,7 +439,7 @@ def read_pdf_pages(doc, pdf_parser="PyPDF2"):
|
|||
)
|
||||
|
||||
|
||||
def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"):
|
||||
def get_page_tokens(pdf_path, model=None, pdf_parser=DEFAULT_PDF_PARSER):
|
||||
pages = read_pdf_pages(pdf_path, pdf_parser=pdf_parser)
|
||||
return [(text, litellm.token_counter(model=model, text=text)) for text in pages]
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import os
|
|||
import json
|
||||
from pageindex import *
|
||||
from pageindex.page_index_md import md_to_tree
|
||||
from pageindex.utils import ConfigLoader
|
||||
from pageindex.utils import ConfigLoader, DEFAULT_PDF_PARSER
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Set up argument parser
|
||||
|
|
@ -67,7 +67,7 @@ if __name__ == "__main__":
|
|||
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})
|
||||
|
||||
# Process the PDF
|
||||
toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or "PyPDF2")
|
||||
toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or DEFAULT_PDF_PARSER)
|
||||
print('Parsing done, saving to file...')
|
||||
|
||||
# Save results
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue