diff --git a/docs/2023-annual-report-truncated.pdf b/docs/2023-annual-report-truncated.pdf new file mode 100644 index 0000000..fd983bd Binary files /dev/null and b/docs/2023-annual-report-truncated.pdf differ diff --git a/pageindex/page_index.py b/pageindex/page_index.py index dedc0bd..6711378 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -665,8 +665,13 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None): page_contents = [] for page_index in range(prev_physical_index, next_physical_index+1): - page_text = f"\n{page_list[page_index-start_index][0]}\n\n\n" - page_contents.append(page_text) + # Add bounds checking to prevent IndexError + list_index = page_index - start_index + if list_index >= 0 and list_index < len(page_list): + page_text = f"\n{page_list[list_index][0]}\n\n\n" + page_contents.append(page_text) + else: + continue item_copy = copy.deepcopy(item) del item_copy['page'] @@ -754,12 +759,25 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, # Helper function to process and check a single incorrect item async def process_and_check_item(incorrect_item): list_index = incorrect_item['list_index'] + + # Check if list_index is valid + if list_index < 0 or list_index >= len(toc_with_page_number): + # Return an invalid result for out-of-bounds indices + return { + 'list_index': list_index, + 'title': incorrect_item['title'], + 'physical_index': incorrect_item.get('physical_index'), + 'is_valid': False + } + # Find the previous correct item prev_correct = None for i in range(list_index-1, -1, -1): - if i not in incorrect_indices: - prev_correct = toc_with_page_number[i]['physical_index'] - break + if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number): + physical_index = toc_with_page_number[i].get('physical_index') + if physical_index is not None: + prev_correct = physical_index + break # If no previous correct item found, use start_index if prev_correct is None: prev_correct = start_index - 1 @@ -767,9 +785,11 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, # Find the next correct item next_correct = None for i in range(list_index+1, len(toc_with_page_number)): - if i not in incorrect_indices: - next_correct = toc_with_page_number[i]['physical_index'] - break + if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number): + physical_index = toc_with_page_number[i].get('physical_index') + if physical_index is not None: + next_correct = physical_index + break # If no next correct item found, use end_index if next_correct is None: next_correct = end_index @@ -783,8 +803,13 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, page_contents=[] for page_index in range(prev_correct, next_correct+1): - page_text = f"\n{page_list[page_index-start_index][0]}\n\n\n" - page_contents.append(page_text) + # Add bounds checking to prevent IndexError + list_index = page_index - start_index + if list_index >= 0 and list_index < len(page_list): + page_text = f"\n{page_list[list_index][0]}\n\n\n" + page_contents.append(page_text) + else: + continue content_range = ''.join(page_contents) physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model) @@ -817,7 +842,17 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, invalid_results = [] for result in results: if result['is_valid']: - toc_with_page_number[result['list_index']]['physical_index'] = result['physical_index'] + # Add bounds checking to prevent IndexError + list_idx = result['list_index'] + if 0 <= list_idx < len(toc_with_page_number): + toc_with_page_number[list_idx]['physical_index'] = result['physical_index'] + else: + # Index is out of bounds, treat as invalid + invalid_results.append({ + 'list_index': result['list_index'], + 'title': result['title'], + 'physical_index': result['physical_index'], + }) else: invalid_results.append({ 'list_index': result['list_index'], @@ -880,9 +915,11 @@ async def verify_toc(page_list, list_result, start_index=1, N=None, model=None): indexed_sample_list = [] for idx in sample_indices: item = list_result[idx] - item_with_index = item.copy() - item_with_index['list_index'] = idx # Add the original index in list_result - indexed_sample_list.append(item_with_index) + # Skip items with None physical_index (these were invalidated by validate_and_truncate_physical_indices) + if item.get('physical_index') is not None: + item_with_index = item.copy() + item_with_index['list_index'] = idx # Add the original index in list_result + indexed_sample_list.append(item_with_index) # Run checks concurrently tasks = [ @@ -923,6 +960,14 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger) toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None] + + toc_with_page_number = validate_and_truncate_physical_indices( + toc_with_page_number, + len(page_list), + start_index=start_index, + logger=logger + ) + accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model) logger.info({ @@ -954,12 +999,15 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None) node_toc_tree = await meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger) node_toc_tree = await check_title_appearance_in_start_concurrent(node_toc_tree, page_list, model=opt.model, logger=logger) - if node['title'].strip() == node_toc_tree[0]['title'].strip(): - node['nodes'] = post_processing(node_toc_tree[1:], node['end_index']) - node['end_index'] = node_toc_tree[1]['start_index'] + # Filter out items with None physical_index before post_processing + valid_node_toc_items = [item for item in node_toc_tree if item.get('physical_index') is not None] + + if valid_node_toc_items and node['title'].strip() == valid_node_toc_items[0]['title'].strip(): + node['nodes'] = post_processing(valid_node_toc_items[1:], node['end_index']) + node['end_index'] = valid_node_toc_items[1]['start_index'] if len(valid_node_toc_items) > 1 else node['end_index'] else: - node['nodes'] = post_processing(node_toc_tree, node['end_index']) - node['end_index'] = node_toc_tree[0]['start_index'] + node['nodes'] = post_processing(valid_node_toc_items, node['end_index']) + node['end_index'] = valid_node_toc_items[0]['start_index'] if valid_node_toc_items else node['end_index'] if 'nodes' in node and node['nodes']: tasks = [ @@ -993,7 +1041,11 @@ async def tree_parser(page_list, opt, doc=None, logger=None): toc_with_page_number = add_preface_if_needed(toc_with_page_number) toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger) - toc_tree = post_processing(toc_with_page_number, len(page_list)) + + # Filter out items with None physical_index before post_processings + valid_toc_items = [item for item in toc_with_page_number if item.get('physical_index') is not None] + + toc_tree = post_processing(valid_toc_items, len(page_list)) tasks = [ process_large_node_recursively(node, page_list, opt, logger=logger) for node in toc_tree @@ -1052,5 +1104,34 @@ def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node= return page_index_main(doc, opt) - - \ No newline at end of file +def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None): + """ + Validates and truncates physical indices that exceed the actual document length. + This prevents errors when TOC references pages that don't exist in the document (e.g. the file is broken or incomplete). + """ + if not toc_with_page_number: + return toc_with_page_number + + max_allowed_page = page_list_length + start_index - 1 + truncated_items = [] + + for i, item in enumerate(toc_with_page_number): + if item.get('physical_index') is not None: + original_index = item['physical_index'] + if original_index > max_allowed_page: + item['physical_index'] = None + truncated_items.append({ + 'title': item.get('title', 'Unknown'), + 'original_index': original_index + }) + if logger: + logger.info(f"Removed physical_index for '{item.get('title', 'Unknown')}' (was {original_index}, too far beyond document)") + + if truncated_items and logger: + logger.info(f"Total removed items: {len(truncated_items)}") + + print(f"Document validation: {page_list_length} pages, max allowed index: {max_allowed_page}") + if truncated_items: + print(f"Truncated {len(truncated_items)} TOC items that exceeded document length") + + return toc_with_page_number \ No newline at end of file diff --git a/results/2023-annual-report-truncated_structure.json b/results/2023-annual-report-truncated_structure.json new file mode 100644 index 0000000..11ed67a --- /dev/null +++ b/results/2023-annual-report-truncated_structure.json @@ -0,0 +1,83 @@ +{ + "doc_name": "2023-annual-report-truncated.pdf", + "structure": [ + { + "title": "Preface", + "start_index": 1, + "end_index": 4, + "node_id": "0000" + }, + { + "title": "About the Federal Reserve", + "start_index": 5, + "end_index": 7, + "node_id": "0001" + }, + { + "title": "Overview", + "start_index": 7, + "end_index": 8, + "node_id": "0002" + }, + { + "title": "Monetary Policy and Economic Developments", + "start_index": 9, + "end_index": 9, + "nodes": [ + { + "title": "March 2024 Summary", + "start_index": 9, + "end_index": 14, + "node_id": "0004" + }, + { + "title": "June 2023 Summary", + "start_index": 15, + "end_index": 20, + "node_id": "0005" + } + ], + "node_id": "0003" + }, + { + "title": "Financial Stability", + "start_index": 21, + "end_index": 21, + "nodes": [ + { + "title": "Monitoring Financial Vulnerabilities", + "start_index": 22, + "end_index": 28, + "node_id": "0007" + }, + { + "title": "Domestic and International Cooperation and Coordination", + "start_index": 28, + "end_index": 30, + "node_id": "0008" + } + ], + "node_id": "0006" + }, + { + "title": "Supervision and Regulation", + "start_index": 31, + "end_index": 32, + "nodes": [ + { + "title": "Supervised and Regulated Institutions", + "start_index": 32, + "end_index": 35, + "node_id": "0010" + }, + { + "title": "Supervisory Developments", + "start_index": 35, + "end_index": 50, + "node_id": "0011" + } + ], + "node_id": "0009" + } + ] +} \ No newline at end of file