disable process_toc_no_page_numbers

2026-07-15 21:11:05 +02:00 · 2025-04-06 19:29:01 +08:00 · 2025-04-06 19:29:01 +08:00 · 23d1614291
commit 23d1614291
parent d6b92e0372
5 changed files with 238 additions and 160 deletions
--- a/docs/earthmover.pdf
+++ b/docs/earthmover.pdf
--- a/page_index.py
+++ b/page_index.py
@ -27,7 +27,7 @@ def check_title_appearance(item, page_list, start_index=1, model=None):
    prompt = f"""
    Your job is to check if the given section appears or starts in the given page_text.

-    Note: ignore any space inconsistency in the page_text.
+    Note: do fuzzy matching, ignore any space inconsistency in the page_text.

    The given section title is {title}.
    The given page_text is {page_text}.
@ -178,7 +178,7 @@ def extract_toc_content(content, model=None):
    prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
    new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
    response = response + new_response
-    if_complete = check_if_toc_transformation_is_complete(content, response)
+    if_complete = check_if_toc_transformation_is_complete(content, response, model)
    
    while not (if_complete == "yes" and finish_reason == "finished"):
        chat_history = [
@ -188,7 +188,7 @@ def extract_toc_content(content, model=None):
        prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
        new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
        response = response + new_response
-        if_complete = check_if_toc_transformation_is_complete(content, response)
+        if_complete = check_if_toc_transformation_is_complete(content, response, model)
        
        # Optional: Add a maximum retry limit to prevent infinite loops
        if len(chat_history) > 5:  # Arbitrary limit: stop once chat_history holds more than 5 entries
@ -207,6 +207,7 @@ def detect_page_index(toc_content, model=None):

    Reply format:
    {{
+        "thinking": <why do you think there are page numbers/indices given within the table of contents>
        "page_index_given_in_toc": "<yes or no>"
    }}
    Directly return the final JSON structure. Do not output anything else."""
@ -318,7 +319,7 @@ def toc_transformer(toc_content, model=None):
            new_complete =  get_json_content(new_complete)
            last_complete = last_complete+new_complete

-        if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete)
+        if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
        

    last_complete = json.loads(last_complete)
@ -615,7 +616,7 @@ def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, model=N
    
    start_page_index = toc_page_list[-1] + 1
    main_content = ""
-    for page_index in range(start_page_index, start_page_index + 20):
+    for page_index in range(start_page_index, min(start_page_index + opt.toc_check_page_num, len(page_list))):
        main_content += f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n"

    toc_with_physical_index = toc_index_extractor(toc_no_page_number, main_content, model)
@ -784,10 +785,7 @@ def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_
            page_contents.append(page_text)
        content_range = ''.join(page_contents)
        
-        physical_index = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
-        
-        # Convert to int for checking
-        physical_index_int = convert_physical_index_to_int(physical_index)
+        physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
        
        # Check if the result is correct
        check_item = incorrect_item.copy()
@ -978,33 +976,23 @@ def tree_parser(page_list, opt, logger=None):
    check_toc_result = check_toc(page_list, opt)    
    logger.info(check_toc_result)

-    if check_toc_result['toc_content'] is None:
+    if check_toc_result['toc_content'] is not None and check_toc_result['page_index_given_in_toc'] == 'yes':
+        toc_with_page_number = meta_processor(
+            page_list, 
+            mode='process_toc_with_page_numbers', 
+            start_index=1, 
+            toc_content=check_toc_result['toc_content'], 
+            toc_page_list=check_toc_result['toc_page_list'], 
+            opt=opt,
+            logger=logger)
+    else:
        toc_with_page_number = meta_processor(
            page_list, 
            mode='process_no_toc', 
            start_index=1, 
            opt=opt,
            logger=logger)
-    else:
-        if check_toc_result['page_index_given_in_toc'] == 'yes':
-            toc_with_page_number = meta_processor(
-                page_list, 
-                mode='process_toc_with_page_numbers', 
-                start_index=1, 
-                toc_content=check_toc_result['toc_content'], 
-                toc_page_list=check_toc_result['toc_page_list'], 
-                opt=opt,
-                logger=logger)
-        else:
-            toc_with_page_number = meta_processor(
-                page_list, 
-                mode='process_toc_no_page_numbers',
-                start_index=1,
-                toc_content=check_toc_result['toc_content'],
-                toc_page_list=check_toc_result['toc_page_list'], 
-                opt=opt,
-                logger=logger)
-    
+
    toc_with_page_number = add_preface_if_needed(toc_with_page_number)
    toc_with_page_number = check_title_appearance_in_start_parallel(toc_with_page_number, page_list, model=opt.model, logger=logger)
    toc_tree = post_processing(toc_with_page_number, len(page_list))
@ -1026,6 +1014,12 @@ def page_index_main(doc, opt=None):

    print('Parsing PDF...')
    page_list = get_page_tokens(doc)
+    # Write each page's text to a log file, wrapped in tags carrying its 1-based physical index
+    with open(f'./logs/{os.path.basename(doc)}_page_list.txt', 'w', encoding='utf-8') as f:
+        for page_index, page_text in enumerate(page_list):
+            page_text = f"<physical_index_{page_index+1}>\n{page_text[0]}\n<physical_index_{page_index+1}>\n\n"
+            f.write(page_text)
+
    logger.info({'total_page_number': len(page_list)})
    logger.info({'total_token': sum([page[1] for page in page_list])})
    
--- a/results/earthmover_structure.json
+++ b/results/earthmover_structure.json
@ -0,0 +1,137 @@
+{
+  "doc_name": "earthmover.pdf",
+  "structure": [
+    {
+      "title": "Earth Mover\u2019s Distance based Similarity Search at Scale",
+      "start_index": 1,
+      "end_index": 1,
+      "node_id": "0000"
+    },
+    {
+      "title": "ABSTRACT",
+      "start_index": 1,
+      "end_index": 1,
+      "node_id": "0001"
+    },
+    {
+      "title": "INTRODUCTION",
+      "start_index": 1,
+      "end_index": 2,
+      "node_id": "0002"
+    },
+    {
+      "title": "PRELIMINARIES",
+      "start_index": 2,
+      "end_index": 2,
+      "nodes": [
+        {
+          "title": "Computing the EMD",
+          "start_index": 3,
+          "end_index": 3,
+          "node_id": "0004"
+        },
+        {
+          "title": "Filter-and-Refinement Framework",
+          "start_index": 3,
+          "end_index": 4,
+          "node_id": "0005"
+        }
+      ],
+      "node_id": "0003"
+    },
+    {
+      "title": "SCALING UP SSP",
+      "start_index": 4,
+      "end_index": 5,
+      "node_id": "0006"
+    },
+    {
+      "title": "BOOSTING THE REFINEMENT PHASE",
+      "start_index": 5,
+      "end_index": 5,
+      "nodes": [
+        {
+          "title": "Analysis of EMD Calculation",
+          "start_index": 5,
+          "end_index": 6,
+          "node_id": "0008"
+        },
+        {
+          "title": "Progressive Bounding",
+          "start_index": 6,
+          "end_index": 6,
+          "node_id": "0009"
+        },
+        {
+          "title": "Sensitivity to Refinement Order",
+          "start_index": 6,
+          "end_index": 7,
+          "node_id": "0010"
+        },
+        {
+          "title": "Dynamic Refinement Ordering",
+          "start_index": 7,
+          "end_index": 8,
+          "node_id": "0011"
+        },
+        {
+          "title": "Running Upper Bound",
+          "start_index": 8,
+          "end_index": 8,
+          "node_id": "0012"
+        }
+      ],
+      "node_id": "0007"
+    },
+    {
+      "title": "EXPERIMENTAL EVALUATION",
+      "start_index": 8,
+      "end_index": 9,
+      "nodes": [
+        {
+          "title": "Performance Improvement",
+          "start_index": 9,
+          "end_index": 10,
+          "node_id": "0014"
+        },
+        {
+          "title": "Scalability Experiments",
+          "start_index": 10,
+          "end_index": 11,
+          "node_id": "0015"
+        },
+        {
+          "title": "Parameter Tuning in DRO",
+          "start_index": 11,
+          "end_index": 12,
+          "node_id": "0016"
+        }
+      ],
+      "node_id": "0013"
+    },
+    {
+      "title": "RELATED WORK",
+      "start_index": 12,
+      "end_index": 12,
+      "node_id": "0017"
+    },
+    {
+      "title": "CONCLUSION",
+      "start_index": 12,
+      "end_index": 12,
+      "node_id": "0018"
+    },
+    {
+      "title": "ACKNOWLEDGMENT",
+      "start_index": 12,
+      "end_index": 12,
+      "node_id": "0019"
+    },
+    {
+      "title": "REFERENCES",
+      "start_index": 12,
+      "end_index": 12,
+      "node_id": "0020"
+    }
+  ]
+}
--- a/results/four-lectures_structure.json
+++ b/results/four-lectures_structure.json
@ -2,78 +2,80 @@
  "doc_name": "four-lectures.pdf",
  "structure": [
    {
-      "title": "Preface",
+      "title": "Four Lectures on Standard ML",
      "start_index": 1,
      "end_index": 1,
-      "node_id": "0000"
-    },
-    {
-      "title": "ML at a Glance",
-      "start_index": 2,
-      "end_index": 2,
      "nodes": [
        {
-          "title": "An ML session",
+          "title": "ML at a Glance",
          "start_index": 2,
-          "end_index": 3,
-          "node_id": "0002"
-        },
-        {
-          "title": "Types and Values",
-          "start_index": 3,
-          "end_index": 4,
-          "node_id": "0003"
-        },
-        {
-          "title": "Recursive Functions",
-          "start_index": 4,
-          "end_index": 4,
-          "node_id": "0004"
-        },
-        {
-          "title": "Raising Exceptions",
-          "start_index": 4,
-          "end_index": 5,
-          "node_id": "0005"
-        },
-        {
-          "title": "Structures",
-          "start_index": 5,
-          "end_index": 6,
-          "node_id": "0006"
-        },
-        {
-          "title": "Signatures",
-          "start_index": 6,
-          "end_index": 7,
-          "node_id": "0007"
-        },
-        {
-          "title": "Coercive Signature Matching",
-          "start_index": 7,
-          "end_index": 8,
-          "node_id": "0008"
-        },
-        {
-          "title": "Functor Declaration",
-          "start_index": 8,
-          "end_index": 9,
-          "node_id": "0009"
-        },
-        {
-          "title": "Functor Application",
-          "start_index": 9,
-          "end_index": 9,
-          "node_id": "0010"
-        },
-        {
-          "title": "Summary",
-          "start_index": 9,
-          "end_index": 9,
-          "node_id": "0011"
+          "end_index": 2,
+          "nodes": [
+            {
+              "title": "An ML session",
+              "start_index": 2,
+              "end_index": 3,
+              "node_id": "0002"
+            },
+            {
+              "title": "Types and Values",
+              "start_index": 3,
+              "end_index": 4,
+              "node_id": "0003"
+            },
+            {
+              "title": "Recursive Functions",
+              "start_index": 4,
+              "end_index": 4,
+              "node_id": "0004"
+            },
+            {
+              "title": "Raising Exceptions",
+              "start_index": 4,
+              "end_index": 5,
+              "node_id": "0005"
+            },
+            {
+              "title": "Structures",
+              "start_index": 5,
+              "end_index": 6,
+              "node_id": "0006"
+            },
+            {
+              "title": "Signatures",
+              "start_index": 6,
+              "end_index": 7,
+              "node_id": "0007"
+            },
+            {
+              "title": "Coercive Signature Matching",
+              "start_index": 7,
+              "end_index": 8,
+              "node_id": "0008"
+            },
+            {
+              "title": "Functor Declaration",
+              "start_index": 8,
+              "end_index": 9,
+              "node_id": "0009"
+            },
+            {
+              "title": "Functor Application",
+              "start_index": 9,
+              "end_index": 9,
+              "node_id": "0010"
+            },
+            {
+              "title": "Summary",
+              "start_index": 9,
+              "end_index": 9,
+              "node_id": "0011"
+            }
+          ],
+          "node_id": "0001"
        }
      ],
-      "node_id": "0001"
+      "node_id": "0000"
    },
    {
      "title": "Programming with ML Modules",
@ -264,70 +266,14 @@
    {
      "title": "Appendix A: The Bare Interpreter",
      "start_index": 44,
-      "end_index": 44,
-      "nodes": [
-        {
-          "title": "Syntax",
-          "start_index": 44,
-          "end_index": 44,
-          "node_id": "0043"
-        },
-        {
-          "title": "Parsing",
-          "start_index": 44,
-          "end_index": 45,
-          "node_id": "0044"
-        },
-        {
-          "title": "Environments",
-          "start_index": 45,
-          "end_index": 46,
-          "node_id": "0045"
-        },
-        {
-          "title": "Evaluation",
-          "start_index": 46,
-          "end_index": 46,
-          "node_id": "0046"
-        },
-        {
-          "title": "Type Checking",
-          "start_index": 46,
-          "end_index": 46,
-          "node_id": "0047"
-        },
-        {
-          "title": "The Interpreter",
-          "start_index": 46,
-          "end_index": 47,
-          "node_id": "0048"
-        },
-        {
-          "title": "The Evaluator",
-          "start_index": 47,
-          "end_index": 49,
-          "node_id": "0049"
-        },
-        {
-          "title": "The Typechecker",
-          "start_index": 49,
-          "end_index": 50,
-          "node_id": "0050"
-        },
-        {
-          "title": "The Basics",
-          "start_index": 50,
-          "end_index": 52,
-          "node_id": "0051"
-        }
-      ],
+      "end_index": 52,
      "node_id": "0042"
    },
    {
      "title": "Appendix B: Files",
      "start_index": 53,
      "end_index": 53,
-      "node_id": "0052"
+      "node_id": "0043"
    }
  ]
 }
--- a/utils.py
+++ b/utils.py
@ -492,24 +492,25 @@ def check_token_limit(structure, limit=110000):
            print("Start Index:", node['start_index'])
            print("End Index:", node['end_index'])
            print("Title:", node['title'])
-            # print(node['text'])
            print("\n")


 def convert_physical_index_to_int(data):
    if isinstance(data, list):
        for i in range(len(data)):
-            if isinstance(data[i]['physical_index'], str):
-                if data[i]['physical_index'].startswith('<physical_index_'):
-                    data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].rstrip('>').strip())
-                elif data[i]['physical_index'].startswith('physical_index_'):
-                    data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip())
+            # Check if item is a dictionary and has 'physical_index' key
+            if isinstance(data[i], dict) and 'physical_index' in data[i]:
+                if isinstance(data[i]['physical_index'], str):
+                    if data[i]['physical_index'].startswith('<physical_index_'):
+                        data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].rstrip('>').strip())
+                    elif data[i]['physical_index'].startswith('physical_index_'):
+                        data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip())
    elif isinstance(data, str):
        if data.startswith('<physical_index_'):
            data = int(data.split('_')[-1].rstrip('>').strip())
        elif data.startswith('physical_index_'):
            data = int(data.split('_')[-1].strip())
-        ###check data is int
+        # Check data is int
        if isinstance(data, int):
            return data
        else: