mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-09 19:45:15 +02:00
Take pdf_parser out of ConfigLoader, use plain function arg
This commit is contained in:
parent
de58581900
commit
1629ef4318
4 changed files with 11 additions and 18 deletions
|
|
@ -31,7 +31,7 @@ class PageIndexClient:
|
|||
For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
|
||||
"""
|
||||
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None,
|
||||
workspace: str = None, pdf_parser: str = None):
|
||||
workspace: str = None, pdf_parser: str = "PyPDF2"):
|
||||
if api_key:
|
||||
os.environ["OPENAI_API_KEY"] = api_key
|
||||
elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
|
||||
|
|
@ -42,12 +42,10 @@ class PageIndexClient:
|
|||
overrides["model"] = model
|
||||
if retrieve_model:
|
||||
overrides["retrieve_model"] = retrieve_model
|
||||
if pdf_parser:
|
||||
overrides["pdf_parser"] = pdf_parser
|
||||
opt = ConfigLoader().load(overrides or None)
|
||||
self.model = opt.model
|
||||
self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model)
|
||||
self.pdf_parser = opt.pdf_parser
|
||||
self.pdf_parser = pdf_parser
|
||||
if self.workspace:
|
||||
self.workspace.mkdir(parents=True, exist_ok=True)
|
||||
self.documents = {}
|
||||
|
|
|
|||
|
|
@ -1063,18 +1063,18 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
|
|||
return toc_tree
|
||||
|
||||
|
||||
def page_index_main(doc, opt=None):
|
||||
def page_index_main(doc, opt=None, pdf_parser="PyPDF2"):
|
||||
logger = JsonLogger(doc)
|
||||
|
||||
|
||||
is_valid_pdf = (
|
||||
(isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
|
||||
(isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
|
||||
isinstance(doc, BytesIO)
|
||||
)
|
||||
if not is_valid_pdf:
|
||||
raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")
|
||||
|
||||
print('Parsing PDF...')
|
||||
page_list = get_page_tokens(doc, model=opt.model, pdf_parser=opt.pdf_parser)
|
||||
page_list = get_page_tokens(doc, model=opt.model, pdf_parser=pdf_parser)
|
||||
|
||||
logger.info({'total_page_number': len(page_list)})
|
||||
logger.info({'total_token': sum([page[1] for page in page_list])})
|
||||
|
|
@ -1112,14 +1112,14 @@ def page_index_main(doc, opt=None):
|
|||
|
||||
def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
|
||||
if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None,
|
||||
pdf_parser=None):
|
||||
pdf_parser="PyPDF2"):
|
||||
|
||||
user_opt = {
|
||||
arg: value for arg, value in locals().items()
|
||||
if arg != "doc" and value is not None
|
||||
if arg not in ("doc", "pdf_parser") and value is not None
|
||||
}
|
||||
opt = ConfigLoader().load(user_opt)
|
||||
return page_index_main(doc, opt)
|
||||
return page_index_main(doc, opt, pdf_parser=pdf_parser)
|
||||
|
||||
|
||||
def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None):
|
||||
|
|
|
|||
|
|
@ -685,14 +685,10 @@ def format_structure(structure, order=None):
|
|||
|
||||
|
||||
class ConfigLoader:
|
||||
# Code-side defaults for non-tuning settings (kept out of config.yaml).
|
||||
# yaml entries override these if present.
|
||||
_CODE_DEFAULTS = {"pdf_parser": "PyPDF2"}
|
||||
|
||||
def __init__(self, default_path: str = None):
|
||||
if default_path is None:
|
||||
default_path = Path(__file__).parent / "config.yaml"
|
||||
self._default_dict = {**self._CODE_DEFAULTS, **self._load_yaml(default_path)}
|
||||
self._default_dict = self._load_yaml(default_path)
|
||||
|
||||
@staticmethod
|
||||
def _load_yaml(path):
|
||||
|
|
|
|||
|
|
@ -63,12 +63,11 @@ if __name__ == "__main__":
|
|||
'if_add_node_summary': args.if_add_node_summary,
|
||||
'if_add_doc_description': args.if_add_doc_description,
|
||||
'if_add_node_text': args.if_add_node_text,
|
||||
'pdf_parser': args.pdf_parser,
|
||||
}
|
||||
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})
|
||||
|
||||
# Process the PDF
|
||||
toc_with_page_number = page_index_main(args.pdf_path, opt)
|
||||
toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or "PyPDF2")
|
||||
print('Parsing done, saving to file...')
|
||||
|
||||
# Save results
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue