fix: rename tob_extractor_prompt typo to toc_extractor_prompt (#109)

Signed-off-by: Matías Insaurralde <matias@insaurral.de>
This commit is contained in:
Matias Insaurralde 2026-02-27 04:16:19 -03:00 committed by GitHub
parent 89bcb9240a
commit cf52a678a3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -239,7 +239,7 @@ def toc_extractor(page_list, toc_page_list, model):
def toc_index_extractor(toc, content, model=None):
print('start toc_index_extractor')
tob_extractor_prompt = """
toc_extractor_prompt = """
You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format.
The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.
@ -260,7 +260,7 @@ def toc_index_extractor(toc, content, model=None):
If the section is not in the provided pages, do not add the physical_index to it.
Directly return the final JSON structure. Do not output anything else."""
prompt = tob_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
response = ChatGPT_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content
@ -730,7 +730,7 @@ def check_toc(page_list, opt=None):
################### fix incorrect toc #########################################################
def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
tob_extractor_prompt = """
toc_extractor_prompt = """
You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.
The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.
@ -742,7 +742,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20
}
Directly return the final JSON structure. Do not output anything else."""
prompt = tob_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
response = ChatGPT_API(model=model, prompt=prompt)
json_content = extract_json(response)
return convert_physical_index_to_int(json_content['physical_index'])