fix: handle TOC items exceeding document length

2026-04-24 23:56:21 +02:00 · 2025-05-30 03:03:20 +01:00 · 2025-05-30 03:03:20 +01:00 · 1679600c9a
commit 1679600c9a
parent febcec60b9
3 changed files with 186 additions and 22 deletions
--- a/docs/2023-annual-report-truncated.pdf
+++ b/docs/2023-annual-report-truncated.pdf
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@ -665,8 +665,13 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
            page_contents = []
            for page_index in range(prev_physical_index, next_physical_index+1):
-                page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
+                # Add bounds checking to prevent IndexError
-                page_contents.append(page_text)
+                list_index = page_index - start_index
                if list_index >= 0 and list_index < len(page_list):
                    page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
                    page_contents.append(page_text)
                else:
                    continue
            item_copy = copy.deepcopy(item)
            del item_copy['page']
@ -754,12 +759,25 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
    # Helper function to process and check a single incorrect item
    async def process_and_check_item(incorrect_item):
        list_index = incorrect_item['list_index']
        # Check if list_index is valid
        if list_index < 0 or list_index >= len(toc_with_page_number):
            # Return an invalid result for out-of-bounds indices
            return {
                'list_index': list_index,
                'title': incorrect_item['title'],
                'physical_index': incorrect_item.get('physical_index'),
                'is_valid': False
            }
        # Find the previous correct item
        prev_correct = None
        for i in range(list_index-1, -1, -1):
-            if i not in incorrect_indices:
+            if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
-                prev_correct = toc_with_page_number[i]['physical_index']
+                physical_index = toc_with_page_number[i].get('physical_index')
-                break
+                if physical_index is not None:
                    prev_correct = physical_index
                    break
        # If no previous correct item found, use start_index
        if prev_correct is None:
            prev_correct = start_index - 1
@ -767,9 +785,11 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
        # Find the next correct item
        next_correct = None
        for i in range(list_index+1, len(toc_with_page_number)):
-            if i not in incorrect_indices:
+            if i not in incorrect_indices and i >= 0 and i < len(toc_with_page_number):
-                next_correct = toc_with_page_number[i]['physical_index']
+                physical_index = toc_with_page_number[i].get('physical_index')
-                break
+                if physical_index is not None:
                    next_correct = physical_index
                    break
        # If no next correct item found, use end_index
        if next_correct is None:
            next_correct = end_index
@ -783,8 +803,13 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
        page_contents=[]
        for page_index in range(prev_correct, next_correct+1):
-            page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
+            # Add bounds checking to prevent IndexError
-            page_contents.append(page_text)
+            list_index = page_index - start_index
            if list_index >= 0 and list_index < len(page_list):
                page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
                page_contents.append(page_text)
            else:
                continue
        content_range = ''.join(page_contents)
        physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
@ -817,7 +842,17 @@ async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results,
    invalid_results = []
    for result in results:
        if result['is_valid']:
-            toc_with_page_number[result['list_index']]['physical_index'] = result['physical_index']
+            # Add bounds checking to prevent IndexError
            list_idx = result['list_index']
            if 0 <= list_idx < len(toc_with_page_number):
                toc_with_page_number[list_idx]['physical_index'] = result['physical_index']
            else:
                # Index is out of bounds, treat as invalid
                invalid_results.append({
                    'list_index': result['list_index'],
                    'title': result['title'],
                    'physical_index': result['physical_index'],
                })
        else:
            invalid_results.append({
                'list_index': result['list_index'],
@ -880,9 +915,11 @@ async def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
    indexed_sample_list = []
    for idx in sample_indices:
        item = list_result[idx]
-        item_with_index = item.copy()
+        # Skip items with None physical_index (these were invalidated by validate_and_truncate_physical_indices)
-        item_with_index['list_index'] = idx  # Add the original index in list_result
+        if item.get('physical_index') is not None:
-        indexed_sample_list.append(item_with_index)
+            item_with_index = item.copy()
            item_with_index['list_index'] = idx  # Add the original index in list_result
            indexed_sample_list.append(item_with_index)
    # Run checks concurrently
    tasks = [
@ -923,6 +960,14 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
        toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
    toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None] 
    toc_with_page_number = validate_and_truncate_physical_indices(
        toc_with_page_number, 
        len(page_list), 
        start_index=start_index, 
        logger=logger
    )
    accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)
    logger.info({
@ -954,12 +999,15 @@ async def process_large_node_recursively(node, page_list, opt=None, logger=None)
        node_toc_tree = await meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger)
        node_toc_tree = await check_title_appearance_in_start_concurrent(node_toc_tree, page_list, model=opt.model, logger=logger)
-        if node['title'].strip() == node_toc_tree[0]['title'].strip():
+        # Filter out items with None physical_index before post_processing
-            node['nodes'] = post_processing(node_toc_tree[1:], node['end_index'])
+        valid_node_toc_items = [item for item in node_toc_tree if item.get('physical_index') is not None]
-            node['end_index'] = node_toc_tree[1]['start_index']
+        
        if valid_node_toc_items and node['title'].strip() == valid_node_toc_items[0]['title'].strip():
            node['nodes'] = post_processing(valid_node_toc_items[1:], node['end_index'])
            node['end_index'] = valid_node_toc_items[1]['start_index'] if len(valid_node_toc_items) > 1 else node['end_index']
        else:
-            node['nodes'] = post_processing(node_toc_tree, node['end_index'])
+            node['nodes'] = post_processing(valid_node_toc_items, node['end_index'])
-            node['end_index'] = node_toc_tree[0]['start_index']
+            node['end_index'] = valid_node_toc_items[0]['start_index'] if valid_node_toc_items else node['end_index']
    if 'nodes' in node and node['nodes']:
        tasks = [
@ -993,7 +1041,11 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
    toc_with_page_number = add_preface_if_needed(toc_with_page_number)
    toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger)
-    toc_tree = post_processing(toc_with_page_number, len(page_list))
+    
    # Filter out items with None physical_index before post_processing
    valid_toc_items = [item for item in toc_with_page_number if item.get('physical_index') is not None]
    toc_tree = post_processing(valid_toc_items, len(page_list))
    tasks = [
        process_large_node_recursively(node, page_list, opt, logger=logger)
        for node in toc_tree
@ -1052,5 +1104,34 @@ def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=
    return page_index_main(doc, opt)
-
def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None):
    """
    Invalidate TOC physical indices that fall outside the document's page range.

    A TOC can reference pages that don't exist in the document (e.g. the file is
    broken or incomplete). Such entries stay in the list, but their
    'physical_index' is set to None so that callers can filter or skip them
    instead of indexing outside the page list.

    The valid range is ``start_index <= physical_index <= page_list_length +
    start_index - 1``. Indices below ``start_index`` are rejected as well: they
    would map to a negative list offset, which Python silently wraps around to
    a page at the end of the list.

    Args:
        toc_with_page_number: List of TOC item dicts. Items are modified in place.
        page_list_length: Number of pages available in the page list.
        start_index: Physical index that corresponds to the first page in the list.
        logger: Optional object exposing ``info(message)``; when given, every
            removed index is reported through it.

    Returns:
        The same list object that was passed in (entries with out-of-range
        indices now carry ``physical_index = None``).
    """
    if not toc_with_page_number:
        return toc_with_page_number

    max_allowed_page = page_list_length + start_index - 1
    truncated_items = []    # indices past the last available page
    below_range_items = []  # indices before the first available page

    for item in toc_with_page_number:
        original_index = item.get('physical_index')
        if original_index is None:
            continue

        title = item.get('title', 'Unknown')
        if original_index > max_allowed_page:
            item['physical_index'] = None
            truncated_items.append({
                'title': title,
                'original_index': original_index
            })
            if logger:
                logger.info(f"Removed physical_index for '{title}' (was {original_index}, too far beyond document)")
        elif original_index < start_index:
            item['physical_index'] = None
            below_range_items.append({
                'title': title,
                'original_index': original_index
            })
            if logger:
                logger.info(f"Removed physical_index for '{title}' (was {original_index}, before first page {start_index})")

    removed_count = len(truncated_items) + len(below_range_items)
    if removed_count and logger:
        logger.info(f"Total removed items: {removed_count}")

    print(f"Document validation: {page_list_length} pages, max allowed index: {max_allowed_page}")
    if truncated_items:
        print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")
    if below_range_items:
        print(f"Removed {len(below_range_items)} TOC items that preceded the first page")

    return toc_with_page_number
--- a/results/2023-annual-report-truncated_structure.json
+++ b/results/2023-annual-report-truncated_structure.json
@ -0,0 +1,83 @@
 {
  "doc_name": "2023-annual-report-truncated.pdf",
  "structure": [
    {
      "title": "Preface",
      "start_index": 1,
      "end_index": 4,
      "node_id": "0000"
    },
    {
      "title": "About the Federal Reserve",
      "start_index": 5,
      "end_index": 7,
      "node_id": "0001"
    },
    {
      "title": "Overview",
      "start_index": 7,
      "end_index": 8,
      "node_id": "0002"
    },
    {
      "title": "Monetary Policy and Economic Developments",
      "start_index": 9,
      "end_index": 9,
      "nodes": [
        {
          "title": "March 2024 Summary",
          "start_index": 9,
          "end_index": 14,
          "node_id": "0004"
        },
        {
          "title": "June 2023 Summary",
          "start_index": 15,
          "end_index": 20,
          "node_id": "0005"
        }
      ],
      "node_id": "0003"
    },
    {
      "title": "Financial Stability",
      "start_index": 21,
      "end_index": 21,
      "nodes": [
        {
          "title": "Monitoring Financial Vulnerabilities",
          "start_index": 22,
          "end_index": 28,
          "node_id": "0007"
        },
        {
          "title": "Domestic and International Cooperation and Coordination",
          "start_index": 28,
          "end_index": 30,
          "node_id": "0008"
        }
      ],
      "node_id": "0006"
    },
    {
      "title": "Supervision and Regulation",
      "start_index": 31,
      "end_index": 32,
      "nodes": [
        {
          "title": "Supervised and Regulated Institutions",
          "start_index": 32,
          "end_index": 35,
          "node_id": "0010"
        },
        {
          "title": "Supervisory Developments",
          "start_index": 35,
          "end_index": 50,
          "node_id": "0011"
        }
      ],
      "node_id": "0009"
    }
  ]
 }