mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
fix model
This commit is contained in:
parent
6d1b505541
commit
4b4ae4d51d
5 changed files with 6 additions and 6 deletions
|
|
@ -123,7 +123,7 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
|
|||
You can customize the processing with additional optional arguments:
|
||||
|
||||
```
|
||||
--model OpenAI model to use (default: gpt-4.1)
|
||||
--model OpenAI model to use (default: gpt-4o-2024-11-20)
|
||||
--toc-check-pages Pages to check for table of contents (default: 20)
|
||||
--max-pages-per-node Max pages per node (default: 10)
|
||||
--max-tokens-per-node Max tokens per node (default: 20000)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
model: "gpt-4.1"
|
||||
model: "gpt-4o-2024-11-20"
|
||||
toc_check_page_num: 20
|
||||
max_page_num_each_node: 10
|
||||
max_token_num_each_node: 20000
|
||||
|
|
|
|||
|
|
@ -496,7 +496,7 @@ def remove_first_physical_index_section(text):
|
|||
return text
|
||||
|
||||
### add verify completeness
|
||||
def generate_toc_continue(toc_content, part, model="gpt-4.1"):
|
||||
def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
|
||||
print('start generate_toc_continue')
|
||||
prompt = """
|
||||
You are an expert in extracting hierarchical tree structure.
|
||||
|
|
@ -729,7 +729,7 @@ def check_toc(page_list, opt=None):
|
|||
|
||||
|
||||
################### fix incorrect toc #########################################################
|
||||
def single_toc_item_index_fixer(section_title, content, model="gpt-4.1"):
|
||||
def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
|
||||
tob_extractor_prompt = """
|
||||
You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.
|
||||
|
||||
|
|
|
|||
|
|
@ -410,7 +410,7 @@ def add_preface_if_needed(data):
|
|||
|
||||
|
||||
|
||||
def get_page_tokens(pdf_path, model="gpt-4.1", pdf_parser="PyPDF2"):
|
||||
def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
|
||||
enc = tiktoken.encoding_for_model(model)
|
||||
if pdf_parser == "PyPDF2":
|
||||
pdf_reader = PyPDF2.PdfReader(pdf_path)
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ if __name__ == "__main__":
|
|||
parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
|
||||
parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
|
||||
|
||||
parser.add_argument('--model', type=str, default='gpt-4.1', help='Model to use')
|
||||
parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
|
||||
|
||||
parser.add_argument('--toc-check-pages', type=int, default=20,
|
||||
help='Number of pages to check for table of contents (PDF only)')
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue