mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-31 19:15:15 +02:00
disable process_toc_no_page_numbers
This commit is contained in:
parent
d6b92e0372
commit
23d1614291
5 changed files with 238 additions and 160 deletions
BIN
docs/earthmover.pdf
Normal file
BIN
docs/earthmover.pdf
Normal file
Binary file not shown.
|
|
@ -27,7 +27,7 @@ def check_title_appearance(item, page_list, start_index=1, model=None):
|
|||
prompt = f"""
|
||||
Your job is to check if the given section appears or starts in the given page_text.
|
||||
|
||||
Note: ignore any space inconsistency in the page_text.
|
||||
Note: do fuzzy matching, ignore any space inconsistency in the page_text.
|
||||
|
||||
The given section title is {title}.
|
||||
The given page_text is {page_text}.
|
||||
|
|
@ -178,7 +178,7 @@ def extract_toc_content(content, model=None):
|
|||
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
|
||||
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
|
||||
response = response + new_response
|
||||
if_complete = check_if_toc_transformation_is_complete(content, response)
|
||||
if_complete = check_if_toc_transformation_is_complete(content, response, model)
|
||||
|
||||
while not (if_complete == "yes" and finish_reason == "finished"):
|
||||
chat_history = [
|
||||
|
|
@ -188,7 +188,7 @@ def extract_toc_content(content, model=None):
|
|||
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
|
||||
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
|
||||
response = response + new_response
|
||||
if_complete = check_if_toc_transformation_is_complete(content, response)
|
||||
if_complete = check_if_toc_transformation_is_complete(content, response, model)
|
||||
|
||||
# Optional: Add a maximum retry limit to prevent infinite loops
|
||||
if len(chat_history) > 5: # Arbitrary cap: triggers once chat_history holds more than 5 entries
|
||||
|
|
@ -207,6 +207,7 @@ def detect_page_index(toc_content, model=None):
|
|||
|
||||
Reply format:
|
||||
{{
|
||||
"thinking": <why do you think there are page numbers/indices given within the table of contents>
|
||||
"page_index_given_in_toc": "<yes or no>"
|
||||
}}
|
||||
Directly return the final JSON structure. Do not output anything else."""
|
||||
|
|
@ -318,7 +319,7 @@ def toc_transformer(toc_content, model=None):
|
|||
new_complete = get_json_content(new_complete)
|
||||
last_complete = last_complete+new_complete
|
||||
|
||||
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete)
|
||||
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
|
||||
|
||||
|
||||
last_complete = json.loads(last_complete)
|
||||
|
|
@ -615,7 +616,7 @@ def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, model=N
|
|||
|
||||
start_page_index = toc_page_list[-1] + 1
|
||||
main_content = ""
|
||||
for page_index in range(start_page_index, start_page_index + 20):
|
||||
for page_index in range(start_page_index, min(start_page_index + opt.toc_check_page_num, len(page_list))):
|
||||
main_content += f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n"
|
||||
|
||||
toc_with_physical_index = toc_index_extractor(toc_no_page_number, main_content, model)
|
||||
|
|
@ -784,10 +785,7 @@ def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_
|
|||
page_contents.append(page_text)
|
||||
content_range = ''.join(page_contents)
|
||||
|
||||
physical_index = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
|
||||
|
||||
# Convert to int for checking
|
||||
physical_index_int = convert_physical_index_to_int(physical_index)
|
||||
physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
|
||||
|
||||
# Check if the result is correct
|
||||
check_item = incorrect_item.copy()
|
||||
|
|
@ -978,33 +976,23 @@ def tree_parser(page_list, opt, logger=None):
|
|||
check_toc_result = check_toc(page_list, opt)
|
||||
logger.info(check_toc_result)
|
||||
|
||||
if check_toc_result['toc_content'] is None:
|
||||
if check_toc_result['toc_content'] is not None and check_toc_result['page_index_given_in_toc'] == 'yes':
|
||||
toc_with_page_number = meta_processor(
|
||||
page_list,
|
||||
mode='process_toc_with_page_numbers',
|
||||
start_index=1,
|
||||
toc_content=check_toc_result['toc_content'],
|
||||
toc_page_list=check_toc_result['toc_page_list'],
|
||||
opt=opt,
|
||||
logger=logger)
|
||||
else:
|
||||
toc_with_page_number = meta_processor(
|
||||
page_list,
|
||||
mode='process_no_toc',
|
||||
start_index=1,
|
||||
opt=opt,
|
||||
logger=logger)
|
||||
else:
|
||||
if check_toc_result['page_index_given_in_toc'] == 'yes':
|
||||
toc_with_page_number = meta_processor(
|
||||
page_list,
|
||||
mode='process_toc_with_page_numbers',
|
||||
start_index=1,
|
||||
toc_content=check_toc_result['toc_content'],
|
||||
toc_page_list=check_toc_result['toc_page_list'],
|
||||
opt=opt,
|
||||
logger=logger)
|
||||
else:
|
||||
toc_with_page_number = meta_processor(
|
||||
page_list,
|
||||
mode='process_toc_no_page_numbers',
|
||||
start_index=1,
|
||||
toc_content=check_toc_result['toc_content'],
|
||||
toc_page_list=check_toc_result['toc_page_list'],
|
||||
opt=opt,
|
||||
logger=logger)
|
||||
|
||||
|
||||
toc_with_page_number = add_preface_if_needed(toc_with_page_number)
|
||||
toc_with_page_number = check_title_appearance_in_start_parallel(toc_with_page_number, page_list, model=opt.model, logger=logger)
|
||||
toc_tree = post_processing(toc_with_page_number, len(page_list))
|
||||
|
|
@ -1026,6 +1014,12 @@ def page_index_main(doc, opt=None):
|
|||
|
||||
print('Parsing PDF...')
|
||||
page_list = get_page_tokens(doc)
|
||||
# Write each page's text from page_list to a log file, wrapped in its 1-based physical index tags
|
||||
with open(f'./logs/{os.path.basename(doc)}_page_list.txt', 'w', encoding='utf-8') as f:
|
||||
for page_index, page_text in enumerate(page_list):
|
||||
page_text = f"<physical_index_{page_index+1}>\n{page_text[0]}\n<physical_index_{page_index+1}>\n\n"
|
||||
f.write(page_text)
|
||||
|
||||
logger.info({'total_page_number': len(page_list)})
|
||||
logger.info({'total_token': sum([page[1] for page in page_list])})
|
||||
|
||||
|
|
|
|||
137
results/earthmover_structure.json
Normal file
137
results/earthmover_structure.json
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
{
|
||||
"doc_name": "earthmover.pdf",
|
||||
"structure": [
|
||||
{
|
||||
"title": "Earth Mover\u2019s Distance based Similarity Search at Scale",
|
||||
"start_index": 1,
|
||||
"end_index": 1,
|
||||
"node_id": "0000"
|
||||
},
|
||||
{
|
||||
"title": "ABSTRACT",
|
||||
"start_index": 1,
|
||||
"end_index": 1,
|
||||
"node_id": "0001"
|
||||
},
|
||||
{
|
||||
"title": "INTRODUCTION",
|
||||
"start_index": 1,
|
||||
"end_index": 2,
|
||||
"node_id": "0002"
|
||||
},
|
||||
{
|
||||
"title": "PRELIMINARIES",
|
||||
"start_index": 2,
|
||||
"end_index": 2,
|
||||
"nodes": [
|
||||
{
|
||||
"title": "Computing the EMD",
|
||||
"start_index": 3,
|
||||
"end_index": 3,
|
||||
"node_id": "0004"
|
||||
},
|
||||
{
|
||||
"title": "Filter-and-Refinement Framework",
|
||||
"start_index": 3,
|
||||
"end_index": 4,
|
||||
"node_id": "0005"
|
||||
}
|
||||
],
|
||||
"node_id": "0003"
|
||||
},
|
||||
{
|
||||
"title": "SCALING UP SSP",
|
||||
"start_index": 4,
|
||||
"end_index": 5,
|
||||
"node_id": "0006"
|
||||
},
|
||||
{
|
||||
"title": "BOOSTING THE REFINEMENT PHASE",
|
||||
"start_index": 5,
|
||||
"end_index": 5,
|
||||
"nodes": [
|
||||
{
|
||||
"title": "Analysis of EMD Calculation",
|
||||
"start_index": 5,
|
||||
"end_index": 6,
|
||||
"node_id": "0008"
|
||||
},
|
||||
{
|
||||
"title": "Progressive Bounding",
|
||||
"start_index": 6,
|
||||
"end_index": 6,
|
||||
"node_id": "0009"
|
||||
},
|
||||
{
|
||||
"title": "Sensitivity to Refinement Order",
|
||||
"start_index": 6,
|
||||
"end_index": 7,
|
||||
"node_id": "0010"
|
||||
},
|
||||
{
|
||||
"title": "Dynamic Refinement Ordering",
|
||||
"start_index": 7,
|
||||
"end_index": 8,
|
||||
"node_id": "0011"
|
||||
},
|
||||
{
|
||||
"title": "Running Upper Bound",
|
||||
"start_index": 8,
|
||||
"end_index": 8,
|
||||
"node_id": "0012"
|
||||
}
|
||||
],
|
||||
"node_id": "0007"
|
||||
},
|
||||
{
|
||||
"title": "EXPERIMENTAL EVALUATION",
|
||||
"start_index": 8,
|
||||
"end_index": 9,
|
||||
"nodes": [
|
||||
{
|
||||
"title": "Performance Improvement",
|
||||
"start_index": 9,
|
||||
"end_index": 10,
|
||||
"node_id": "0014"
|
||||
},
|
||||
{
|
||||
"title": "Scalability Experiments",
|
||||
"start_index": 10,
|
||||
"end_index": 11,
|
||||
"node_id": "0015"
|
||||
},
|
||||
{
|
||||
"title": "Parameter Tuning in DRO",
|
||||
"start_index": 11,
|
||||
"end_index": 12,
|
||||
"node_id": "0016"
|
||||
}
|
||||
],
|
||||
"node_id": "0013"
|
||||
},
|
||||
{
|
||||
"title": "RELATED WORK",
|
||||
"start_index": 12,
|
||||
"end_index": 12,
|
||||
"node_id": "0017"
|
||||
},
|
||||
{
|
||||
"title": "CONCLUSION",
|
||||
"start_index": 12,
|
||||
"end_index": 12,
|
||||
"node_id": "0018"
|
||||
},
|
||||
{
|
||||
"title": "ACKNOWLEDGMENT",
|
||||
"start_index": 12,
|
||||
"end_index": 12,
|
||||
"node_id": "0019"
|
||||
},
|
||||
{
|
||||
"title": "REFERENCES",
|
||||
"start_index": 12,
|
||||
"end_index": 12,
|
||||
"node_id": "0020"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -2,78 +2,80 @@
|
|||
"doc_name": "four-lectures.pdf",
|
||||
"structure": [
|
||||
{
|
||||
"title": "Preface",
|
||||
"title": "Four Lectures on Standard ML",
|
||||
"start_index": 1,
|
||||
"end_index": 1,
|
||||
"node_id": "0000"
|
||||
},
|
||||
{
|
||||
"title": "ML at a Glance",
|
||||
"start_index": 2,
|
||||
"end_index": 2,
|
||||
"nodes": [
|
||||
{
|
||||
"title": "An ML session",
|
||||
"title": "ML at a Glance",
|
||||
"start_index": 2,
|
||||
"end_index": 3,
|
||||
"node_id": "0002"
|
||||
},
|
||||
{
|
||||
"title": "Types and Values",
|
||||
"start_index": 3,
|
||||
"end_index": 4,
|
||||
"node_id": "0003"
|
||||
},
|
||||
{
|
||||
"title": "Recursive Functions",
|
||||
"start_index": 4,
|
||||
"end_index": 4,
|
||||
"node_id": "0004"
|
||||
},
|
||||
{
|
||||
"title": "Raising Exceptions",
|
||||
"start_index": 4,
|
||||
"end_index": 5,
|
||||
"node_id": "0005"
|
||||
},
|
||||
{
|
||||
"title": "Structures",
|
||||
"start_index": 5,
|
||||
"end_index": 6,
|
||||
"node_id": "0006"
|
||||
},
|
||||
{
|
||||
"title": "Signatures",
|
||||
"start_index": 6,
|
||||
"end_index": 7,
|
||||
"node_id": "0007"
|
||||
},
|
||||
{
|
||||
"title": "Coercive Signature Matching",
|
||||
"start_index": 7,
|
||||
"end_index": 8,
|
||||
"node_id": "0008"
|
||||
},
|
||||
{
|
||||
"title": "Functor Declaration",
|
||||
"start_index": 8,
|
||||
"end_index": 9,
|
||||
"node_id": "0009"
|
||||
},
|
||||
{
|
||||
"title": "Functor Application",
|
||||
"start_index": 9,
|
||||
"end_index": 9,
|
||||
"node_id": "0010"
|
||||
},
|
||||
{
|
||||
"title": "Summary",
|
||||
"start_index": 9,
|
||||
"end_index": 9,
|
||||
"node_id": "0011"
|
||||
"end_index": 2,
|
||||
"nodes": [
|
||||
{
|
||||
"title": "An ML session",
|
||||
"start_index": 2,
|
||||
"end_index": 3,
|
||||
"node_id": "0002"
|
||||
},
|
||||
{
|
||||
"title": "Types and Values",
|
||||
"start_index": 3,
|
||||
"end_index": 4,
|
||||
"node_id": "0003"
|
||||
},
|
||||
{
|
||||
"title": "Recursive Functions",
|
||||
"start_index": 4,
|
||||
"end_index": 4,
|
||||
"node_id": "0004"
|
||||
},
|
||||
{
|
||||
"title": "Raising Exceptions",
|
||||
"start_index": 4,
|
||||
"end_index": 5,
|
||||
"node_id": "0005"
|
||||
},
|
||||
{
|
||||
"title": "Structures",
|
||||
"start_index": 5,
|
||||
"end_index": 6,
|
||||
"node_id": "0006"
|
||||
},
|
||||
{
|
||||
"title": "Signatures",
|
||||
"start_index": 6,
|
||||
"end_index": 7,
|
||||
"node_id": "0007"
|
||||
},
|
||||
{
|
||||
"title": "Coercive Signature Matching",
|
||||
"start_index": 7,
|
||||
"end_index": 8,
|
||||
"node_id": "0008"
|
||||
},
|
||||
{
|
||||
"title": "Functor Declaration",
|
||||
"start_index": 8,
|
||||
"end_index": 9,
|
||||
"node_id": "0009"
|
||||
},
|
||||
{
|
||||
"title": "Functor Application",
|
||||
"start_index": 9,
|
||||
"end_index": 9,
|
||||
"node_id": "0010"
|
||||
},
|
||||
{
|
||||
"title": "Summary",
|
||||
"start_index": 9,
|
||||
"end_index": 9,
|
||||
"node_id": "0011"
|
||||
}
|
||||
],
|
||||
"node_id": "0001"
|
||||
}
|
||||
],
|
||||
"node_id": "0001"
|
||||
"node_id": "0000"
|
||||
},
|
||||
{
|
||||
"title": "Programming with ML Modules",
|
||||
|
|
@ -264,70 +266,14 @@
|
|||
{
|
||||
"title": "Appendix A: The Bare Interpreter",
|
||||
"start_index": 44,
|
||||
"end_index": 44,
|
||||
"nodes": [
|
||||
{
|
||||
"title": "Syntax",
|
||||
"start_index": 44,
|
||||
"end_index": 44,
|
||||
"node_id": "0043"
|
||||
},
|
||||
{
|
||||
"title": "Parsing",
|
||||
"start_index": 44,
|
||||
"end_index": 45,
|
||||
"node_id": "0044"
|
||||
},
|
||||
{
|
||||
"title": "Environments",
|
||||
"start_index": 45,
|
||||
"end_index": 46,
|
||||
"node_id": "0045"
|
||||
},
|
||||
{
|
||||
"title": "Evaluation",
|
||||
"start_index": 46,
|
||||
"end_index": 46,
|
||||
"node_id": "0046"
|
||||
},
|
||||
{
|
||||
"title": "Type Checking",
|
||||
"start_index": 46,
|
||||
"end_index": 46,
|
||||
"node_id": "0047"
|
||||
},
|
||||
{
|
||||
"title": "The Interpreter",
|
||||
"start_index": 46,
|
||||
"end_index": 47,
|
||||
"node_id": "0048"
|
||||
},
|
||||
{
|
||||
"title": "The Evaluator",
|
||||
"start_index": 47,
|
||||
"end_index": 49,
|
||||
"node_id": "0049"
|
||||
},
|
||||
{
|
||||
"title": "The Typechecker",
|
||||
"start_index": 49,
|
||||
"end_index": 50,
|
||||
"node_id": "0050"
|
||||
},
|
||||
{
|
||||
"title": "The Basics",
|
||||
"start_index": 50,
|
||||
"end_index": 52,
|
||||
"node_id": "0051"
|
||||
}
|
||||
],
|
||||
"end_index": 52,
|
||||
"node_id": "0042"
|
||||
},
|
||||
{
|
||||
"title": "Appendix B: Files",
|
||||
"start_index": 53,
|
||||
"end_index": 53,
|
||||
"node_id": "0052"
|
||||
"node_id": "0043"
|
||||
}
|
||||
]
|
||||
}
|
||||
15
utils.py
15
utils.py
|
|
@ -492,24 +492,25 @@ def check_token_limit(structure, limit=110000):
|
|||
print("Start Index:", node['start_index'])
|
||||
print("End Index:", node['end_index'])
|
||||
print("Title:", node['title'])
|
||||
# print(node['text'])
|
||||
print("\n")
|
||||
|
||||
|
||||
def convert_physical_index_to_int(data):
|
||||
if isinstance(data, list):
|
||||
for i in range(len(data)):
|
||||
if isinstance(data[i]['physical_index'], str):
|
||||
if data[i]['physical_index'].startswith('<physical_index_'):
|
||||
data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].rstrip('>').strip())
|
||||
elif data[i]['physical_index'].startswith('physical_index_'):
|
||||
data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip())
|
||||
# Check if item is a dictionary and has 'physical_index' key
|
||||
if isinstance(data[i], dict) and 'physical_index' in data[i]:
|
||||
if isinstance(data[i]['physical_index'], str):
|
||||
if data[i]['physical_index'].startswith('<physical_index_'):
|
||||
data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].rstrip('>').strip())
|
||||
elif data[i]['physical_index'].startswith('physical_index_'):
|
||||
data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip())
|
||||
elif isinstance(data, str):
|
||||
if data.startswith('<physical_index_'):
|
||||
data = int(data.split('_')[-1].rstrip('>').strip())
|
||||
elif data.startswith('physical_index_'):
|
||||
data = int(data.split('_')[-1].strip())
|
||||
###check data is int
|
||||
# Check data is int
|
||||
if isinstance(data, int):
|
||||
return data
|
||||
else:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue