mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
Integrate LiteLLM for multi-provider LLM support (#168)
* Integrate litellm for multi-provider LLM support
* recover the default config yaml
* Use litellm.acompletion for native async support
* fix tob
* Rename llm_complete/allm_complete to llm_completion/llm_acompletion, remove unused llm_complete_stream
* Pin litellm to version 1.82.0
* resolve comments
* args from the CLI are used to override config.yaml
* Fix get_page_tokens hardcoded model default
  Pass opt.model to get_page_tokens so tokenization respects the configured model instead of always using gpt-4o-2024-11-20.
  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
* Remove explicit openai dependency from requirements.txt
  openai is no longer directly imported; it comes in as a transitive dependency of litellm. Pinning it explicitly risks version conflicts.
  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
* Restore openai==1.101.0 pin in requirements.txt
  litellm==1.82.0 and openai-agents have conflicting openai version requirements, but openai==1.101.0 works at runtime for both. The pin is necessary to prevent litellm from pulling in openai>=2.x, which would break openai-agents.
  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
* Remove explicit openai dependency from requirements.txt
  openai is not directly used; it comes in as a transitive dependency of litellm. There is no openai-agents in this branch, so no pin is needed.
  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
* fix a litellm error log
* resolve comments
---------
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4b4b20f9c4
commit
2403be8f27
5 changed files with 78 additions and 104 deletions
|
|
@ -1,4 +1,5 @@
|
||||||
model: "gpt-4o-2024-11-20"
|
model: "gpt-4o-2024-11-20"
|
||||||
|
# model: "anthropic/claude-sonnet-4-6"
|
||||||
toc_check_page_num: 20
|
toc_check_page_num: 20
|
||||||
max_page_num_each_node: 10
|
max_page_num_each_node: 10
|
||||||
max_token_num_each_node: 20000
|
max_token_num_each_node: 20000
|
||||||
|
|
|
||||||
|
|
@ -36,7 +36,7 @@ async def check_title_appearance(item, page_list, start_index=1, model=None):
|
||||||
}}
|
}}
|
||||||
Directly return the final JSON structure. Do not output anything else."""
|
Directly return the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
response = await ChatGPT_API_async(model=model, prompt=prompt)
|
response = await llm_acompletion(model=model, prompt=prompt)
|
||||||
response = extract_json(response)
|
response = extract_json(response)
|
||||||
if 'answer' in response:
|
if 'answer' in response:
|
||||||
answer = response['answer']
|
answer = response['answer']
|
||||||
|
|
@ -64,7 +64,7 @@ async def check_title_appearance_in_start(title, page_text, model=None, logger=N
|
||||||
}}
|
}}
|
||||||
Directly return the final JSON structure. Do not output anything else."""
|
Directly return the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
response = await ChatGPT_API_async(model=model, prompt=prompt)
|
response = await llm_acompletion(model=model, prompt=prompt)
|
||||||
response = extract_json(response)
|
response = extract_json(response)
|
||||||
if logger:
|
if logger:
|
||||||
logger.info(f"Response: {response}")
|
logger.info(f"Response: {response}")
|
||||||
|
|
@ -116,7 +116,7 @@ def toc_detector_single_page(content, model=None):
|
||||||
Directly return the final JSON structure. Do not output anything else.
|
Directly return the final JSON structure. Do not output anything else.
|
||||||
Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""
|
Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""
|
||||||
|
|
||||||
response = ChatGPT_API(model=model, prompt=prompt)
|
response = llm_completion(model=model, prompt=prompt)
|
||||||
# print('response', response)
|
# print('response', response)
|
||||||
json_content = extract_json(response)
|
json_content = extract_json(response)
|
||||||
return json_content['toc_detected']
|
return json_content['toc_detected']
|
||||||
|
|
@ -135,7 +135,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
|
||||||
Directly return the final JSON structure. Do not output anything else."""
|
Directly return the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
|
prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
|
||||||
response = ChatGPT_API(model=model, prompt=prompt)
|
response = llm_completion(model=model, prompt=prompt)
|
||||||
json_content = extract_json(response)
|
json_content = extract_json(response)
|
||||||
return json_content['completed']
|
return json_content['completed']
|
||||||
|
|
||||||
|
|
@ -153,7 +153,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
|
||||||
Directly return the final JSON structure. Do not output anything else."""
|
Directly return the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
|
prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
|
||||||
response = ChatGPT_API(model=model, prompt=prompt)
|
response = llm_completion(model=model, prompt=prompt)
|
||||||
json_content = extract_json(response)
|
json_content = extract_json(response)
|
||||||
return json_content['completed']
|
return json_content['completed']
|
||||||
|
|
||||||
|
|
@ -165,7 +165,7 @@ def extract_toc_content(content, model=None):
|
||||||
|
|
||||||
Directly return the full table of contents content. Do not output anything else."""
|
Directly return the full table of contents content. Do not output anything else."""
|
||||||
|
|
||||||
response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
|
response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
|
||||||
|
|
||||||
if_complete = check_if_toc_transformation_is_complete(content, response, model)
|
if_complete = check_if_toc_transformation_is_complete(content, response, model)
|
||||||
if if_complete == "yes" and finish_reason == "finished":
|
if if_complete == "yes" and finish_reason == "finished":
|
||||||
|
|
@ -176,7 +176,7 @@ def extract_toc_content(content, model=None):
|
||||||
{"role": "assistant", "content": response},
|
{"role": "assistant", "content": response},
|
||||||
]
|
]
|
||||||
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
|
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
|
||||||
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
|
new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True)
|
||||||
response = response + new_response
|
response = response + new_response
|
||||||
if_complete = check_if_toc_transformation_is_complete(content, response, model)
|
if_complete = check_if_toc_transformation_is_complete(content, response, model)
|
||||||
|
|
||||||
|
|
@ -193,7 +193,7 @@ def extract_toc_content(content, model=None):
|
||||||
{"role": "assistant", "content": response},
|
{"role": "assistant", "content": response},
|
||||||
]
|
]
|
||||||
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
|
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
|
||||||
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
|
new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True)
|
||||||
response = response + new_response
|
response = response + new_response
|
||||||
if_complete = check_if_toc_transformation_is_complete(content, response, model)
|
if_complete = check_if_toc_transformation_is_complete(content, response, model)
|
||||||
|
|
||||||
|
|
@ -215,7 +215,7 @@ def detect_page_index(toc_content, model=None):
|
||||||
}}
|
}}
|
||||||
Directly return the final JSON structure. Do not output anything else."""
|
Directly return the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
response = ChatGPT_API(model=model, prompt=prompt)
|
response = llm_completion(model=model, prompt=prompt)
|
||||||
json_content = extract_json(response)
|
json_content = extract_json(response)
|
||||||
return json_content['page_index_given_in_toc']
|
return json_content['page_index_given_in_toc']
|
||||||
|
|
||||||
|
|
@ -264,7 +264,7 @@ def toc_index_extractor(toc, content, model=None):
|
||||||
Directly return the final JSON structure. Do not output anything else."""
|
Directly return the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
|
prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
|
||||||
response = ChatGPT_API(model=model, prompt=prompt)
|
response = llm_completion(model=model, prompt=prompt)
|
||||||
json_content = extract_json(response)
|
json_content = extract_json(response)
|
||||||
return json_content
|
return json_content
|
||||||
|
|
||||||
|
|
@ -292,7 +292,7 @@ def toc_transformer(toc_content, model=None):
|
||||||
Directly return the final JSON structure, do not output anything else. """
|
Directly return the final JSON structure, do not output anything else. """
|
||||||
|
|
||||||
prompt = init_prompt + '\n Given table of contents\n:' + toc_content
|
prompt = init_prompt + '\n Given table of contents\n:' + toc_content
|
||||||
last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
|
last_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
|
||||||
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
|
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
|
||||||
if if_complete == "yes" and finish_reason == "finished":
|
if if_complete == "yes" and finish_reason == "finished":
|
||||||
last_complete = extract_json(last_complete)
|
last_complete = extract_json(last_complete)
|
||||||
|
|
@ -300,7 +300,12 @@ def toc_transformer(toc_content, model=None):
|
||||||
return cleaned_response
|
return cleaned_response
|
||||||
|
|
||||||
last_complete = get_json_content(last_complete)
|
last_complete = get_json_content(last_complete)
|
||||||
|
attempt = 0
|
||||||
|
max_attempts = 5
|
||||||
while not (if_complete == "yes" and finish_reason == "finished"):
|
while not (if_complete == "yes" and finish_reason == "finished"):
|
||||||
|
attempt += 1
|
||||||
|
if attempt > max_attempts:
|
||||||
|
raise Exception('Failed to complete toc transformation after maximum retries')
|
||||||
position = last_complete.rfind('}')
|
position = last_complete.rfind('}')
|
||||||
if position != -1:
|
if position != -1:
|
||||||
last_complete = last_complete[:position+2]
|
last_complete = last_complete[:position+2]
|
||||||
|
|
@ -316,7 +321,7 @@ def toc_transformer(toc_content, model=None):
|
||||||
|
|
||||||
Please continue the json structure, directly output the remaining part of the json structure."""
|
Please continue the json structure, directly output the remaining part of the json structure."""
|
||||||
|
|
||||||
new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
|
new_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
|
||||||
|
|
||||||
if new_complete.startswith('```json'):
|
if new_complete.startswith('```json'):
|
||||||
new_complete = get_json_content(new_complete)
|
new_complete = get_json_content(new_complete)
|
||||||
|
|
@ -477,7 +482,7 @@ def add_page_number_to_toc(part, structure, model=None):
|
||||||
Directly return the final JSON structure. Do not output anything else."""
|
Directly return the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n"
|
prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n"
|
||||||
current_json_raw = ChatGPT_API(model=model, prompt=prompt)
|
current_json_raw = llm_completion(model=model, prompt=prompt)
|
||||||
json_result = extract_json(current_json_raw)
|
json_result = extract_json(current_json_raw)
|
||||||
|
|
||||||
for item in json_result:
|
for item in json_result:
|
||||||
|
|
@ -499,7 +504,7 @@ def remove_first_physical_index_section(text):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
### add verify completeness
|
### add verify completeness
|
||||||
def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
|
def generate_toc_continue(toc_content, part, model=None):
|
||||||
print('start generate_toc_continue')
|
print('start generate_toc_continue')
|
||||||
prompt = """
|
prompt = """
|
||||||
You are an expert in extracting hierarchical tree structure.
|
You are an expert in extracting hierarchical tree structure.
|
||||||
|
|
@ -527,7 +532,7 @@ def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
|
||||||
Directly return the additional part of the final JSON structure. Do not output anything else."""
|
Directly return the additional part of the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2)
|
prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2)
|
||||||
response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
|
response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
|
||||||
if finish_reason == 'finished':
|
if finish_reason == 'finished':
|
||||||
return extract_json(response)
|
return extract_json(response)
|
||||||
else:
|
else:
|
||||||
|
|
@ -561,7 +566,7 @@ def generate_toc_init(part, model=None):
|
||||||
Directly return the final JSON structure. Do not output anything else."""
|
Directly return the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
prompt = prompt + '\nGiven text\n:' + part
|
prompt = prompt + '\nGiven text\n:' + part
|
||||||
response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
|
response, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
|
||||||
|
|
||||||
if finish_reason == 'finished':
|
if finish_reason == 'finished':
|
||||||
return extract_json(response)
|
return extract_json(response)
|
||||||
|
|
@ -732,7 +737,7 @@ def check_toc(page_list, opt=None):
|
||||||
|
|
||||||
|
|
||||||
################### fix incorrect toc #########################################################
|
################### fix incorrect toc #########################################################
|
||||||
def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
|
async def single_toc_item_index_fixer(section_title, content, model=None):
|
||||||
toc_extractor_prompt = """
|
toc_extractor_prompt = """
|
||||||
You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.
|
You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.
|
||||||
|
|
||||||
|
|
@ -746,7 +751,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20
|
||||||
Directly return the final JSON structure. Do not output anything else."""
|
Directly return the final JSON structure. Do not output anything else."""
|
||||||
|
|
||||||
prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
|
prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
|
||||||
response = ChatGPT_API(model=model, prompt=prompt)
|
response = await llm_acompletion(model=model, prompt=prompt)
|
||||||
json_content = extract_json(response)
|
json_content = extract_json(response)
|
||||||
return convert_physical_index_to_int(json_content['physical_index'])
|
return convert_physical_index_to_int(json_content['physical_index'])
|
||||||
|
|
||||||
|
|
@ -815,7 +820,7 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
|
||||||
continue
|
continue
|
||||||
content_range = ''.join(page_contents)
|
content_range = ''.join(page_contents)
|
||||||
|
|
||||||
physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
|
physical_index_int = await single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
|
||||||
|
|
||||||
# Check if the result is correct
|
# Check if the result is correct
|
||||||
check_item = incorrect_item.copy()
|
check_item = incorrect_item.copy()
|
||||||
|
|
@ -1069,7 +1074,7 @@ def page_index_main(doc, opt=None):
|
||||||
raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")
|
raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")
|
||||||
|
|
||||||
print('Parsing PDF...')
|
print('Parsing PDF...')
|
||||||
page_list = get_page_tokens(doc)
|
page_list = get_page_tokens(doc, model=opt.model)
|
||||||
|
|
||||||
logger.info({'total_page_number': len(page_list)})
|
logger.info({'total_page_number': len(page_list)})
|
||||||
logger.info({'total_token': sum([page[1] for page in page_list])})
|
logger.info({'total_token': sum([page[1] for page in page_list])})
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,4 @@
|
||||||
import tiktoken
|
import litellm
|
||||||
import openai
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
@ -17,95 +16,65 @@ import yaml
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from types import SimpleNamespace as config
|
from types import SimpleNamespace as config
|
||||||
|
|
||||||
CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
|
# Backward compatibility: support CHATGPT_API_KEY as alias for OPENAI_API_KEY
|
||||||
|
if not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
|
||||||
|
os.environ["OPENAI_API_KEY"] = os.getenv("CHATGPT_API_KEY")
|
||||||
|
|
||||||
|
litellm.drop_params = True
|
||||||
|
|
||||||
def count_tokens(text, model=None):
|
def count_tokens(text, model=None):
|
||||||
if not text:
|
if not text:
|
||||||
return 0
|
return 0
|
||||||
enc = tiktoken.encoding_for_model(model)
|
return litellm.token_counter(model=model, text=text)
|
||||||
tokens = enc.encode(text)
|
|
||||||
return len(tokens)
|
|
||||||
|
|
||||||
def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
|
|
||||||
|
def llm_completion(model, prompt, chat_history=None, return_finish_reason=False):
|
||||||
max_retries = 10
|
max_retries = 10
|
||||||
client = openai.OpenAI(api_key=api_key)
|
messages = list(chat_history) + [{"role": "user", "content": prompt}] if chat_history else [{"role": "user", "content": prompt}]
|
||||||
for i in range(max_retries):
|
for i in range(max_retries):
|
||||||
try:
|
try:
|
||||||
if chat_history:
|
response = litellm.completion(
|
||||||
messages = chat_history
|
|
||||||
messages.append({"role": "user", "content": prompt})
|
|
||||||
else:
|
|
||||||
messages = [{"role": "user", "content": prompt}]
|
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
|
||||||
model=model,
|
model=model,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
)
|
)
|
||||||
if response.choices[0].finish_reason == "length":
|
content = response.choices[0].message.content
|
||||||
return response.choices[0].message.content, "max_output_reached"
|
if return_finish_reason:
|
||||||
else:
|
finish_reason = "max_output_reached" if response.choices[0].finish_reason == "length" else "finished"
|
||||||
return response.choices[0].message.content, "finished"
|
return content, finish_reason
|
||||||
|
return content
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print('************* Retrying *************')
|
print('************* Retrying *************')
|
||||||
logging.error(f"Error: {e}")
|
logging.error(f"Error: {e}")
|
||||||
if i < max_retries - 1:
|
if i < max_retries - 1:
|
||||||
time.sleep(1) # Wait for 1秒 before retrying
|
time.sleep(1)
|
||||||
else:
|
else:
|
||||||
logging.error('Max retries reached for prompt: ' + prompt)
|
logging.error('Max retries reached for prompt: ' + prompt)
|
||||||
return "", "error"
|
if return_finish_reason:
|
||||||
|
return "", "error"
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
|
async def llm_acompletion(model, prompt):
|
||||||
max_retries = 10
|
max_retries = 10
|
||||||
client = openai.OpenAI(api_key=api_key)
|
messages = [{"role": "user", "content": prompt}]
|
||||||
for i in range(max_retries):
|
for i in range(max_retries):
|
||||||
try:
|
try:
|
||||||
if chat_history:
|
response = await litellm.acompletion(
|
||||||
messages = chat_history
|
|
||||||
messages.append({"role": "user", "content": prompt})
|
|
||||||
else:
|
|
||||||
messages = [{"role": "user", "content": prompt}]
|
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
|
||||||
model=model,
|
model=model,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
)
|
)
|
||||||
|
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print('************* Retrying *************')
|
print('************* Retrying *************')
|
||||||
logging.error(f"Error: {e}")
|
logging.error(f"Error: {e}")
|
||||||
if i < max_retries - 1:
|
if i < max_retries - 1:
|
||||||
time.sleep(1) # Wait for 1秒 before retrying
|
await asyncio.sleep(1)
|
||||||
else:
|
else:
|
||||||
logging.error('Max retries reached for prompt: ' + prompt)
|
logging.error('Max retries reached for prompt: ' + prompt)
|
||||||
return "Error"
|
return ""
|
||||||
|
|
||||||
|
|
||||||
async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
|
|
||||||
max_retries = 10
|
|
||||||
messages = [{"role": "user", "content": prompt}]
|
|
||||||
for i in range(max_retries):
|
|
||||||
try:
|
|
||||||
async with openai.AsyncOpenAI(api_key=api_key) as client:
|
|
||||||
response = await client.chat.completions.create(
|
|
||||||
model=model,
|
|
||||||
messages=messages,
|
|
||||||
temperature=0,
|
|
||||||
)
|
|
||||||
return response.choices[0].message.content
|
|
||||||
except Exception as e:
|
|
||||||
print('************* Retrying *************')
|
|
||||||
logging.error(f"Error: {e}")
|
|
||||||
if i < max_retries - 1:
|
|
||||||
await asyncio.sleep(1) # Wait for 1s before retrying
|
|
||||||
else:
|
|
||||||
logging.error('Max retries reached for prompt: ' + prompt)
|
|
||||||
return "Error"
|
|
||||||
|
|
||||||
|
|
||||||
def get_json_content(response):
|
def get_json_content(response):
|
||||||
|
|
@ -410,15 +379,14 @@ def add_preface_if_needed(data):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
|
def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"):
|
||||||
enc = tiktoken.encoding_for_model(model)
|
|
||||||
if pdf_parser == "PyPDF2":
|
if pdf_parser == "PyPDF2":
|
||||||
pdf_reader = PyPDF2.PdfReader(pdf_path)
|
pdf_reader = PyPDF2.PdfReader(pdf_path)
|
||||||
page_list = []
|
page_list = []
|
||||||
for page_num in range(len(pdf_reader.pages)):
|
for page_num in range(len(pdf_reader.pages)):
|
||||||
page = pdf_reader.pages[page_num]
|
page = pdf_reader.pages[page_num]
|
||||||
page_text = page.extract_text()
|
page_text = page.extract_text()
|
||||||
token_length = len(enc.encode(page_text))
|
token_length = litellm.token_counter(model=model, text=page_text)
|
||||||
page_list.append((page_text, token_length))
|
page_list.append((page_text, token_length))
|
||||||
return page_list
|
return page_list
|
||||||
elif pdf_parser == "PyMuPDF":
|
elif pdf_parser == "PyMuPDF":
|
||||||
|
|
@ -430,7 +398,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
|
||||||
page_list = []
|
page_list = []
|
||||||
for page in doc:
|
for page in doc:
|
||||||
page_text = page.get_text()
|
page_text = page.get_text()
|
||||||
token_length = len(enc.encode(page_text))
|
token_length = litellm.token_counter(model=model, text=page_text)
|
||||||
page_list.append((page_text, token_length))
|
page_list.append((page_text, token_length))
|
||||||
return page_list
|
return page_list
|
||||||
else:
|
else:
|
||||||
|
|
@ -533,7 +501,7 @@ def remove_structure_text(data):
|
||||||
def check_token_limit(structure, limit=110000):
|
def check_token_limit(structure, limit=110000):
|
||||||
list = structure_to_list(structure)
|
list = structure_to_list(structure)
|
||||||
for node in list:
|
for node in list:
|
||||||
num_tokens = count_tokens(node['text'], model='gpt-4o')
|
num_tokens = count_tokens(node['text'], model=None)
|
||||||
if num_tokens > limit:
|
if num_tokens > limit:
|
||||||
print(f"Node ID: {node['node_id']} has {num_tokens} tokens")
|
print(f"Node ID: {node['node_id']} has {num_tokens} tokens")
|
||||||
print("Start Index:", node['start_index'])
|
print("Start Index:", node['start_index'])
|
||||||
|
|
@ -609,7 +577,7 @@ async def generate_node_summary(node, model=None):
|
||||||
|
|
||||||
Directly return the description, do not include any other text.
|
Directly return the description, do not include any other text.
|
||||||
"""
|
"""
|
||||||
response = await ChatGPT_API_async(model, prompt)
|
response = await llm_acompletion(model, prompt)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -654,7 +622,7 @@ def generate_doc_description(structure, model=None):
|
||||||
|
|
||||||
Directly return the description, do not include any other text.
|
Directly return the description, do not include any other text.
|
||||||
"""
|
"""
|
||||||
response = ChatGPT_API(model, prompt)
|
response = llm_completion(model, prompt)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
openai==1.101.0
|
litellm==1.82.0
|
||||||
pymupdf==1.26.4
|
pymupdf==1.26.4
|
||||||
PyPDF2==3.0.1
|
PyPDF2==3.0.1
|
||||||
python-dotenv==1.1.0
|
python-dotenv==1.1.0
|
||||||
tiktoken==0.11.0
|
|
||||||
pyyaml==6.0.2
|
pyyaml==6.0.2
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import os
|
||||||
import json
|
import json
|
||||||
from pageindex import *
|
from pageindex import *
|
||||||
from pageindex.page_index_md import md_to_tree
|
from pageindex.page_index_md import md_to_tree
|
||||||
|
from pageindex.utils import ConfigLoader
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Set up argument parser
|
# Set up argument parser
|
||||||
|
|
@ -10,22 +11,22 @@ if __name__ == "__main__":
|
||||||
parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
|
parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
|
||||||
parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
|
parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
|
||||||
|
|
||||||
parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
|
parser.add_argument('--model', type=str, default=None, help='Model to use (overrides config.yaml)')
|
||||||
|
|
||||||
parser.add_argument('--toc-check-pages', type=int, default=20,
|
parser.add_argument('--toc-check-pages', type=int, default=None,
|
||||||
help='Number of pages to check for table of contents (PDF only)')
|
help='Number of pages to check for table of contents (PDF only)')
|
||||||
parser.add_argument('--max-pages-per-node', type=int, default=10,
|
parser.add_argument('--max-pages-per-node', type=int, default=None,
|
||||||
help='Maximum number of pages per node (PDF only)')
|
help='Maximum number of pages per node (PDF only)')
|
||||||
parser.add_argument('--max-tokens-per-node', type=int, default=20000,
|
parser.add_argument('--max-tokens-per-node', type=int, default=None,
|
||||||
help='Maximum number of tokens per node (PDF only)')
|
help='Maximum number of tokens per node (PDF only)')
|
||||||
|
|
||||||
parser.add_argument('--if-add-node-id', type=str, default='yes',
|
parser.add_argument('--if-add-node-id', type=str, default=None,
|
||||||
help='Whether to add node id to the node')
|
help='Whether to add node id to the node')
|
||||||
parser.add_argument('--if-add-node-summary', type=str, default='yes',
|
parser.add_argument('--if-add-node-summary', type=str, default=None,
|
||||||
help='Whether to add summary to the node')
|
help='Whether to add summary to the node')
|
||||||
parser.add_argument('--if-add-doc-description', type=str, default='no',
|
parser.add_argument('--if-add-doc-description', type=str, default=None,
|
||||||
help='Whether to add doc description to the doc')
|
help='Whether to add doc description to the doc')
|
||||||
parser.add_argument('--if-add-node-text', type=str, default='no',
|
parser.add_argument('--if-add-node-text', type=str, default=None,
|
||||||
help='Whether to add text to the node')
|
help='Whether to add text to the node')
|
||||||
|
|
||||||
# Markdown specific arguments
|
# Markdown specific arguments
|
||||||
|
|
@ -51,17 +52,17 @@ if __name__ == "__main__":
|
||||||
raise ValueError(f"PDF file not found: {args.pdf_path}")
|
raise ValueError(f"PDF file not found: {args.pdf_path}")
|
||||||
|
|
||||||
# Process PDF file
|
# Process PDF file
|
||||||
# Configure options
|
user_opt = {
|
||||||
opt = config(
|
'model': args.model,
|
||||||
model=args.model,
|
'toc_check_page_num': args.toc_check_pages,
|
||||||
toc_check_page_num=args.toc_check_pages,
|
'max_page_num_each_node': args.max_pages_per_node,
|
||||||
max_page_num_each_node=args.max_pages_per_node,
|
'max_token_num_each_node': args.max_tokens_per_node,
|
||||||
max_token_num_each_node=args.max_tokens_per_node,
|
'if_add_node_id': args.if_add_node_id,
|
||||||
if_add_node_id=args.if_add_node_id,
|
'if_add_node_summary': args.if_add_node_summary,
|
||||||
if_add_node_summary=args.if_add_node_summary,
|
'if_add_doc_description': args.if_add_doc_description,
|
||||||
if_add_doc_description=args.if_add_doc_description,
|
'if_add_node_text': args.if_add_node_text,
|
||||||
if_add_node_text=args.if_add_node_text
|
}
|
||||||
)
|
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})
|
||||||
|
|
||||||
# Process the PDF
|
# Process the PDF
|
||||||
toc_with_page_number = page_index_main(args.pdf_path, opt)
|
toc_with_page_number = page_index_main(args.pdf_path, opt)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue