fix: handle TOC items exceeding document length

2026-04-24 23:56:21 +02:00 · 2025-05-30 03:03:20 +01:00 · 2025-05-30 03:03:20 +01:00 · 1679600c9a
commit 1679600c9a
parent febcec60b9
3 changed files with 186 additions and 22 deletions
--- a/docs/2023-annual-report-truncated.pdf
+++ b/docs/2023-annual-report-truncated.pdf
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@ -665,8 +665,13 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):

            page_contents = []
            for page_index in range(prev_physical_index, next_physical_index+1):
-                page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
-                page_contents.append(page_text)
+                # Add bounds checking to prevent IndexError
+                list_index = page_index - start_index
+                if list_index >= 0 and list_index < len(page_list):
+                    page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
+                    page_contents.append(page_text)
+                else:
+                    continue

            item_copy = copy.deepcopy(item)
            del item_copy['page']
@ -754,12 +759,25 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
    # Helper function to process and check a single incorrect item
    async def process_and_check_item(incorrect_item):
        list_index = incorrect_item['list_index']
+        
+        # Check if list_index is valid
+        if list_index < 0 or list_index >= len(toc_with_page_number):
+            # Return an invalid result for out-of-bounds indices
+            return {
+                'list_index': list_index,
+                'title': incorrect_item['title'],
+                'physical_index': incorrect_item.get('physical_index'),
+                'is_valid': False
+            }
+        
        # Find the previous correct item
        prev_correct = None
        for i in range(list_index-1, -1, -1):
-            if i not in incorrect_indices:
-                prev_correct = toc_with_page_number[i]['physical_index']
-                break
+            if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
+                physical_index = toc_with_page_number[i].get('physical_index')
+                if physical_index is not None:
+                    prev_correct = physical_index
+                    break
        # If no previous correct item found, use start_index
        if prev_correct is None:
            prev_correct = start_index - 1
@ -767,9 +785,11 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
        # Find the next correct item
        next_correct = None
        for i in range(list_index+1, len(toc_with_page_number)):
-            if i not in incorrect_indices:
-                next_correct = toc_with_page_number[i]['physical_index']
-                break
+            if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
+                physical_index = toc_with_page_number[i].get('physical_index')
+                if physical_index is not None:
+                    next_correct = physical_index
+                    break
        # If no next correct item found, use end_index
        if next_correct is None:
            next_correct = end_index
@ -783,8 +803,13 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,

        page_contents=[]
        for page_index in range(prev_correct, next_correct+1):
-            page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
-            page_contents.append(page_text)
+            # Add bounds checking to prevent IndexError
+            list_index = page_index - start_index
+            if list_index >= 0 and list_index < len(page_list):
+                page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
+                page_contents.append(page_text)
+            else:
+                continue
        content_range = ''.join(page_contents)
        
        physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
@ -817,7 +842,17 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
    invalid_results = []
    for result in results:
        if result['is_valid']:
-            toc_with_page_number[result['list_index']]['physical_index'] = result['physical_index']
+            # Add bounds checking to prevent IndexError
+            list_idx = result['list_index']
+            if 0 <= list_idx < len(toc_with_page_number):
+                toc_with_page_number[list_idx]['physical_index'] = result['physical_index']
+            else:
+                # Index is out of bounds, treat as invalid
+                invalid_results.append({
+                    'list_index': result['list_index'],
+                    'title': result['title'],
+                    'physical_index': result['physical_index'],
+                })
        else:
            invalid_results.append({
                'list_index': result['list_index'],
@ -880,9 +915,11 @@ async def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
    indexed_sample_list = []
    for idx in sample_indices:
        item = list_result[idx]
-        item_with_index = item.copy()
-        item_with_index['list_index'] = idx  # Add the original index in list_result
-        indexed_sample_list.append(item_with_index)
+        # Skip items with None physical_index (these were invalidated by validate_and_truncate_physical_indices)
+        if item.get('physical_index') is not None:
+            item_with_index = item.copy()
+            item_with_index['list_index'] = idx  # Add the original index in list_result
+            indexed_sample_list.append(item_with_index)

    # Run checks concurrently
    tasks = [
@ -923,6 +960,14 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
        toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
            
    toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None] 
+    
+    toc_with_page_number = validate_and_truncate_physical_indices(
+        toc_with_page_number, 
+        len(page_list), 
+        start_index=start_index, 
+        logger=logger
+    )
+    
    accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)
        
    logger.info({
@ -954,12 +999,15 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None)
        node_toc_tree = await meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger)
        node_toc_tree = await check_title_appearance_in_start_concurrent(node_toc_tree, page_list, model=opt.model, logger=logger)
        
-        if node['title'].strip() == node_toc_tree[0]['title'].strip():
-            node['nodes'] = post_processing(node_toc_tree[1:], node['end_index'])
-            node['end_index'] = node_toc_tree[1]['start_index']
+        # Filter out items with None physical_index before post_processing
+        valid_node_toc_items = [item for item in node_toc_tree if item.get('physical_index') is not None]
+        
+        if valid_node_toc_items and node['title'].strip() == valid_node_toc_items[0]['title'].strip():
+            node['nodes'] = post_processing(valid_node_toc_items[1:], node['end_index'])
+            node['end_index'] = valid_node_toc_items[1]['start_index'] if len(valid_node_toc_items) > 1 else node['end_index']
        else:
-            node['nodes'] = post_processing(node_toc_tree, node['end_index'])
-            node['end_index'] = node_toc_tree[0]['start_index']
+            node['nodes'] = post_processing(valid_node_toc_items, node['end_index'])
+            node['end_index'] = valid_node_toc_items[0]['start_index'] if valid_node_toc_items else node['end_index']
        
    if 'nodes' in node and node['nodes']:
        tasks = [
@ -993,7 +1041,11 @@ async def tree_parser(page_list, opt, doc=None, logger=None):

    toc_with_page_number = add_preface_if_needed(toc_with_page_number)
    toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger)
-    toc_tree = post_processing(toc_with_page_number, len(page_list))
+    
+    # Filter out items with None physical_index before post_processings
+    valid_toc_items = [item for item in toc_with_page_number if item.get('physical_index') is not None]
+    
+    toc_tree = post_processing(valid_toc_items, len(page_list))
    tasks = [
        process_large_node_recursively(node, page_list, opt, logger=logger)
        for node in toc_tree
@ -1052,5 +1104,34 @@ def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=
    return page_index_main(doc, opt)


-
-    
+def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None):
+    """
+    Validates and truncates physical indices that exceed the actual document length.
+    This prevents errors when TOC references pages that don't exist in the document (e.g. the file is broken or incomplete).
+    """
+    if not toc_with_page_number:
+        return toc_with_page_number
+    
+    max_allowed_page = page_list_length + start_index - 1
+    truncated_items = []
+    
+    for i, item in enumerate(toc_with_page_number):
+        if item.get('physical_index') is not None:
+            original_index = item['physical_index']
+            if original_index > max_allowed_page:
+                item['physical_index'] = None
+                truncated_items.append({
+                    'title': item.get('title', 'Unknown'),
+                    'original_index': original_index
+                })
+                if logger:
+                    logger.info(f"Removed physical_index for '{item.get('title', 'Unknown')}' (was {original_index}, too far beyond document)")
+    
+    if truncated_items and logger:
+        logger.info(f"Total removed items: {len(truncated_items)}")
+        
+    print(f"Document validation: {page_list_length} pages, max allowed index: {max_allowed_page}")
+    if truncated_items:
+        print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")
+     
+    return toc_with_page_number
--- a/results/2023-annual-report-truncated_structure.json
+++ b/results/2023-annual-report-truncated_structure.json
@ -0,0 +1,83 @@
+{
+  "doc_name": "2023-annual-report-truncated.pdf",
+  "structure": [
+    {
+      "title": "Preface",
+      "start_index": 1,
+      "end_index": 4,
+      "node_id": "0000"
+    },
+    {
+      "title": "About the Federal Reserve",
+      "start_index": 5,
+      "end_index": 7,
+      "node_id": "0001"
+    },
+    {
+      "title": "Overview",
+      "start_index": 7,
+      "end_index": 8,
+      "node_id": "0002"
+    },
+    {
+      "title": "Monetary Policy and Economic Developments",
+      "start_index": 9,
+      "end_index": 9,
+      "nodes": [
+        {
+          "title": "March 2024 Summary",
+          "start_index": 9,
+          "end_index": 14,
+          "node_id": "0004"
+        },
+        {
+          "title": "June 2023 Summary",
+          "start_index": 15,
+          "end_index": 20,
+          "node_id": "0005"
+        }
+      ],
+      "node_id": "0003"
+    },
+    {
+      "title": "Financial Stability",
+      "start_index": 21,
+      "end_index": 21,
+      "nodes": [
+        {
+          "title": "Monitoring Financial Vulnerabilities",
+          "start_index": 22,
+          "end_index": 28,
+          "node_id": "0007"
+        },
+        {
+          "title": "Domestic and International Cooperation and Coordination",
+          "start_index": 28,
+          "end_index": 30,
+          "node_id": "0008"
+        }
+      ],
+      "node_id": "0006"
+    },
+    {
+      "title": "Supervision and Regulation",
+      "start_index": 31,
+      "end_index": 32,
+      "nodes": [
+        {
+          "title": "Supervised and Regulated Institutions",
+          "start_index": 32,
+          "end_index": 35,
+          "node_id": "0010"
+        },
+        {
+          "title": "Supervisory Developments",
+          "start_index": 35,
+          "end_index": 50,
+          "node_id": "0011"
+        }
+      ],
+      "node_id": "0009"
+    }
+  ]
+}