mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
fix: handle TOC items exceeding document length
This commit is contained in:
parent
febcec60b9
commit
1679600c9a
3 changed files with 186 additions and 22 deletions
BIN
docs/2023-annual-report-truncated.pdf
Normal file
BIN
docs/2023-annual-report-truncated.pdf
Normal file
Binary file not shown.
|
|
@ -665,8 +665,13 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
|
||||||
|
|
||||||
page_contents = []
|
page_contents = []
|
||||||
for page_index in range(prev_physical_index, next_physical_index+1):
|
for page_index in range(prev_physical_index, next_physical_index+1):
|
||||||
page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
|
# Add bounds checking to prevent IndexError
|
||||||
page_contents.append(page_text)
|
list_index = page_index - start_index
|
||||||
|
if list_index >= 0 and list_index < len(page_list):
|
||||||
|
page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
|
||||||
|
page_contents.append(page_text)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
item_copy = copy.deepcopy(item)
|
item_copy = copy.deepcopy(item)
|
||||||
del item_copy['page']
|
del item_copy['page']
|
||||||
|
|
@ -754,12 +759,25 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
|
||||||
# Helper function to process and check a single incorrect item
|
# Helper function to process and check a single incorrect item
|
||||||
async def process_and_check_item(incorrect_item):
|
async def process_and_check_item(incorrect_item):
|
||||||
list_index = incorrect_item['list_index']
|
list_index = incorrect_item['list_index']
|
||||||
|
|
||||||
|
# Check if list_index is valid
|
||||||
|
if list_index < 0 or list_index >= len(toc_with_page_number):
|
||||||
|
# Return an invalid result for out-of-bounds indices
|
||||||
|
return {
|
||||||
|
'list_index': list_index,
|
||||||
|
'title': incorrect_item['title'],
|
||||||
|
'physical_index': incorrect_item.get('physical_index'),
|
||||||
|
'is_valid': False
|
||||||
|
}
|
||||||
|
|
||||||
# Find the previous correct item
|
# Find the previous correct item
|
||||||
prev_correct = None
|
prev_correct = None
|
||||||
for i in range(list_index-1, -1, -1):
|
for i in range(list_index-1, -1, -1):
|
||||||
if i not in incorrect_indices:
|
if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
|
||||||
prev_correct = toc_with_page_number[i]['physical_index']
|
physical_index = toc_with_page_number[i].get('physical_index')
|
||||||
break
|
if physical_index is not None:
|
||||||
|
prev_correct = physical_index
|
||||||
|
break
|
||||||
# If no previous correct item found, use start_index
|
# If no previous correct item found, use start_index
|
||||||
if prev_correct is None:
|
if prev_correct is None:
|
||||||
prev_correct = start_index - 1
|
prev_correct = start_index - 1
|
||||||
|
|
@ -767,9 +785,11 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
|
||||||
# Find the next correct item
|
# Find the next correct item
|
||||||
next_correct = None
|
next_correct = None
|
||||||
for i in range(list_index+1, len(toc_with_page_number)):
|
for i in range(list_index+1, len(toc_with_page_number)):
|
||||||
if i not in incorrect_indices:
|
if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
|
||||||
next_correct = toc_with_page_number[i]['physical_index']
|
physical_index = toc_with_page_number[i].get('physical_index')
|
||||||
break
|
if physical_index is not None:
|
||||||
|
next_correct = physical_index
|
||||||
|
break
|
||||||
# If no next correct item found, use end_index
|
# If no next correct item found, use end_index
|
||||||
if next_correct is None:
|
if next_correct is None:
|
||||||
next_correct = end_index
|
next_correct = end_index
|
||||||
|
|
@ -783,8 +803,13 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
|
||||||
|
|
||||||
page_contents=[]
|
page_contents=[]
|
||||||
for page_index in range(prev_correct, next_correct+1):
|
for page_index in range(prev_correct, next_correct+1):
|
||||||
page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
|
# Add bounds checking to prevent IndexError
|
||||||
page_contents.append(page_text)
|
list_index = page_index - start_index
|
||||||
|
if list_index >= 0 and list_index < len(page_list):
|
||||||
|
page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
|
||||||
|
page_contents.append(page_text)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
content_range = ''.join(page_contents)
|
content_range = ''.join(page_contents)
|
||||||
|
|
||||||
physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
|
physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
|
||||||
|
|
@ -817,7 +842,17 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
|
||||||
invalid_results = []
|
invalid_results = []
|
||||||
for result in results:
|
for result in results:
|
||||||
if result['is_valid']:
|
if result['is_valid']:
|
||||||
toc_with_page_number[result['list_index']]['physical_index'] = result['physical_index']
|
# Add bounds checking to prevent IndexError
|
||||||
|
list_idx = result['list_index']
|
||||||
|
if 0 <= list_idx < len(toc_with_page_number):
|
||||||
|
toc_with_page_number[list_idx]['physical_index'] = result['physical_index']
|
||||||
|
else:
|
||||||
|
# Index is out of bounds, treat as invalid
|
||||||
|
invalid_results.append({
|
||||||
|
'list_index': result['list_index'],
|
||||||
|
'title': result['title'],
|
||||||
|
'physical_index': result['physical_index'],
|
||||||
|
})
|
||||||
else:
|
else:
|
||||||
invalid_results.append({
|
invalid_results.append({
|
||||||
'list_index': result['list_index'],
|
'list_index': result['list_index'],
|
||||||
|
|
@ -880,9 +915,11 @@ async def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
|
||||||
indexed_sample_list = []
|
indexed_sample_list = []
|
||||||
for idx in sample_indices:
|
for idx in sample_indices:
|
||||||
item = list_result[idx]
|
item = list_result[idx]
|
||||||
item_with_index = item.copy()
|
# Skip items with None physical_index (these were invalidated by validate_and_truncate_physical_indices)
|
||||||
item_with_index['list_index'] = idx # Add the original index in list_result
|
if item.get('physical_index') is not None:
|
||||||
indexed_sample_list.append(item_with_index)
|
item_with_index = item.copy()
|
||||||
|
item_with_index['list_index'] = idx # Add the original index in list_result
|
||||||
|
indexed_sample_list.append(item_with_index)
|
||||||
|
|
||||||
# Run checks concurrently
|
# Run checks concurrently
|
||||||
tasks = [
|
tasks = [
|
||||||
|
|
@ -923,6 +960,14 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
|
||||||
toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
|
toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
|
||||||
|
|
||||||
toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]
|
toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]
|
||||||
|
|
||||||
|
toc_with_page_number = validate_and_truncate_physical_indices(
|
||||||
|
toc_with_page_number,
|
||||||
|
len(page_list),
|
||||||
|
start_index=start_index,
|
||||||
|
logger=logger
|
||||||
|
)
|
||||||
|
|
||||||
accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)
|
accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)
|
||||||
|
|
||||||
logger.info({
|
logger.info({
|
||||||
|
|
@ -954,12 +999,15 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None)
|
||||||
node_toc_tree = await meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger)
|
node_toc_tree = await meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger)
|
||||||
node_toc_tree = await check_title_appearance_in_start_concurrent(node_toc_tree, page_list, model=opt.model, logger=logger)
|
node_toc_tree = await check_title_appearance_in_start_concurrent(node_toc_tree, page_list, model=opt.model, logger=logger)
|
||||||
|
|
||||||
if node['title'].strip() == node_toc_tree[0]['title'].strip():
|
# Filter out items with None physical_index before post_processing
|
||||||
node['nodes'] = post_processing(node_toc_tree[1:], node['end_index'])
|
valid_node_toc_items = [item for item in node_toc_tree if item.get('physical_index') is not None]
|
||||||
node['end_index'] = node_toc_tree[1]['start_index']
|
|
||||||
|
if valid_node_toc_items and node['title'].strip() == valid_node_toc_items[0]['title'].strip():
|
||||||
|
node['nodes'] = post_processing(valid_node_toc_items[1:], node['end_index'])
|
||||||
|
node['end_index'] = valid_node_toc_items[1]['start_index'] if len(valid_node_toc_items) > 1 else node['end_index']
|
||||||
else:
|
else:
|
||||||
node['nodes'] = post_processing(node_toc_tree, node['end_index'])
|
node['nodes'] = post_processing(valid_node_toc_items, node['end_index'])
|
||||||
node['end_index'] = node_toc_tree[0]['start_index']
|
node['end_index'] = valid_node_toc_items[0]['start_index'] if valid_node_toc_items else node['end_index']
|
||||||
|
|
||||||
if 'nodes' in node and node['nodes']:
|
if 'nodes' in node and node['nodes']:
|
||||||
tasks = [
|
tasks = [
|
||||||
|
|
@ -993,7 +1041,11 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
|
||||||
|
|
||||||
toc_with_page_number = add_preface_if_needed(toc_with_page_number)
|
toc_with_page_number = add_preface_if_needed(toc_with_page_number)
|
||||||
toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger)
|
toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger)
|
||||||
toc_tree = post_processing(toc_with_page_number, len(page_list))
|
|
||||||
|
# Filter out items with None physical_index before post_processing
|
||||||
|
valid_toc_items = [item for item in toc_with_page_number if item.get('physical_index') is not None]
|
||||||
|
|
||||||
|
toc_tree = post_processing(valid_toc_items, len(page_list))
|
||||||
tasks = [
|
tasks = [
|
||||||
process_large_node_recursively(node, page_list, opt, logger=logger)
|
process_large_node_recursively(node, page_list, opt, logger=logger)
|
||||||
for node in toc_tree
|
for node in toc_tree
|
||||||
|
|
@ -1052,5 +1104,34 @@ def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=
|
||||||
return page_index_main(doc, opt)
|
return page_index_main(doc, opt)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None):
    """
    Invalidate TOC entries whose physical index lies past the end of the document.

    Despite the function name, out-of-range indices are not clamped to the
    last page: the entry's 'physical_index' is set to None so that callers
    can filter it out or re-resolve it later. This prevents errors when the
    TOC references pages that do not exist in the document (e.g. the file is
    broken or incomplete).

    Args:
        toc_with_page_number: List of TOC item dicts. Each item may carry a
            'physical_index' and a 'title' key. The items are modified in
            place.
        page_list_length: Number of pages actually available.
        start_index: Physical index assigned to the first available page.
        logger: Optional object exposing ``info(message)``; when given, every
            invalidated item and the total count are logged through it.

    Returns:
        The same ``toc_with_page_number`` object that was passed in (an empty
        or None input is returned untouched).
    """
    if not toc_with_page_number:
        return toc_with_page_number

    # Highest physical index that still maps onto an existing page.
    max_allowed_page = page_list_length + start_index - 1
    truncated_items = []

    # NOTE(review): only the upper bound is checked here; indices smaller
    # than start_index pass through unchanged.
    for item in toc_with_page_number:
        original_index = item.get('physical_index')
        if original_index is None:
            # Nothing to validate for items without a resolved page.
            continue

        if original_index > max_allowed_page:
            title = item.get('title', 'Unknown')
            item['physical_index'] = None
            truncated_items.append({
                'title': title,
                'original_index': original_index
            })
            if logger:
                logger.info(f"Removed physical_index for '{title}' (was {original_index}, too far beyond document)")

    if truncated_items and logger:
        logger.info(f"Total removed items: {len(truncated_items)}")

    # Console summary is emitted regardless of whether a logger was supplied.
    print(f"Document validation: {page_list_length} pages, max allowed index: {max_allowed_page}")
    if truncated_items:
        print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")

    return toc_with_page_number
|
||||||
83
results/2023-annual-report-truncated_structure.json
Normal file
83
results/2023-annual-report-truncated_structure.json
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
{
|
||||||
|
"doc_name": "2023-annual-report-truncated.pdf",
|
||||||
|
"structure": [
|
||||||
|
{
|
||||||
|
"title": "Preface",
|
||||||
|
"start_index": 1,
|
||||||
|
"end_index": 4,
|
||||||
|
"node_id": "0000"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "About the Federal Reserve",
|
||||||
|
"start_index": 5,
|
||||||
|
"end_index": 7,
|
||||||
|
"node_id": "0001"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Overview",
|
||||||
|
"start_index": 7,
|
||||||
|
"end_index": 8,
|
||||||
|
"node_id": "0002"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Monetary Policy and Economic Developments",
|
||||||
|
"start_index": 9,
|
||||||
|
"end_index": 9,
|
||||||
|
"nodes": [
|
||||||
|
{
|
||||||
|
"title": "March 2024 Summary",
|
||||||
|
"start_index": 9,
|
||||||
|
"end_index": 14,
|
||||||
|
"node_id": "0004"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "June 2023 Summary",
|
||||||
|
"start_index": 15,
|
||||||
|
"end_index": 20,
|
||||||
|
"node_id": "0005"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"node_id": "0003"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Financial Stability",
|
||||||
|
"start_index": 21,
|
||||||
|
"end_index": 21,
|
||||||
|
"nodes": [
|
||||||
|
{
|
||||||
|
"title": "Monitoring Financial Vulnerabilities",
|
||||||
|
"start_index": 22,
|
||||||
|
"end_index": 28,
|
||||||
|
"node_id": "0007"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Domestic and International Cooperation and Coordination",
|
||||||
|
"start_index": 28,
|
||||||
|
"end_index": 30,
|
||||||
|
"node_id": "0008"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"node_id": "0006"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Supervision and Regulation",
|
||||||
|
"start_index": 31,
|
||||||
|
"end_index": 32,
|
||||||
|
"nodes": [
|
||||||
|
{
|
||||||
|
"title": "Supervised and Regulated Institutions",
|
||||||
|
"start_index": 32,
|
||||||
|
"end_index": 35,
|
||||||
|
"node_id": "0010"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Supervisory Developments",
|
||||||
|
"start_index": 35,
|
||||||
|
"end_index": 50,
|
||||||
|
"node_id": "0011"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"node_id": "0009"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue