mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-30 20:39:43 +02:00
Default behavior unchanged. Users can opt in via pdf_parser="pypdfium2" for cleaner text extraction (no broken words, correct Unicode) and 3-5x faster parsing. PyPDF2 remains the only required dependency; pypdfium2 is lazy-imported.
11 lines
No EOL
435 B
YAML
11 lines
No EOL
435 B
YAML
# Model name. `retrieve_model` falls back to this value when it is not set.
model: "gpt-4o-2024-11-20"
# model: "anthropic/claude-sonnet-4-6"
retrieve_model: "gpt-5.4"  # defaults to `model` if not set

# NOTE(review): presumably the number of pages inspected when looking for a
# table of contents -- confirm against the code that reads this key.
toc_check_page_num: 20

# Per-node size limits.
# NOTE(review): names suggest a page cap and a token cap per node -- confirm.
max_page_num_each_node: 10
max_token_num_each_node: 20000

# Output toggles. The values are deliberately quoted: a bare yes/no would be
# resolved to a boolean by YAML 1.1 loaders instead of staying a string.
# NOTE(review): presumably the consumer compares against the strings
# "yes"/"no" -- verify before converting these to true/false.
if_add_node_id: "yes"
if_add_node_summary: "yes"
if_add_doc_description: "no"
if_add_node_text: "no"

# PDF text extractor. Accepted values:
#   "PyPDF2"    - default, no extra install
#   "pypdfium2" - requires `pip install pypdfium2`
#   "PyMuPDF"
pdf_parser: "PyPDF2"