diff --git a/docs/earthmover.pdf b/docs/earthmover.pdf new file mode 100644 index 0000000..eb5d5e4 Binary files /dev/null and b/docs/earthmover.pdf differ diff --git a/page_index.py b/page_index.py index eb4a5f6..7dc44cb 100644 --- a/page_index.py +++ b/page_index.py @@ -27,7 +27,7 @@ def check_title_appearance(item, page_list, start_index=1, model=None): prompt = f""" Your job is to check if the given section appears or starts in the given page_text. - Note: ignore any space inconsistency in the page_text. + Note: do fuzzy matching, ignore any space inconsistency in the page_text. The given section title is {title}. The given page_text is {page_text}. @@ -178,7 +178,7 @@ def extract_toc_content(content, model=None): prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) response = response + new_response - if_complete = check_if_toc_transformation_is_complete(content, response) + if_complete = check_if_toc_transformation_is_complete(content, response, model) while not (if_complete == "yes" and finish_reason == "finished"): chat_history = [ @@ -188,7 +188,7 @@ def extract_toc_content(content, model=None): prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) response = response + new_response - if_complete = check_if_toc_transformation_is_complete(content, response) + if_complete = check_if_toc_transformation_is_complete(content, response, model) # Optional: Add a maximum retry limit to prevent infinite loops if len(chat_history) > 5: # Arbitrary limit of 10 attempts @@ -207,6 +207,7 @@ def detect_page_index(toc_content, model=None): Reply format: {{ + "thinking": "page_index_given_in_toc": "" }} Directly return the final JSON structure. Do not output anything else.""" @@ -318,7 +319,7 @@ def toc_transformer(toc_content, model=None): new_complete = get_json_content(new_complete) last_complete = last_complete+new_complete - if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete) + if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model) last_complete = json.loads(last_complete) @@ -615,7 +616,7 @@ def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, model=N start_page_index = toc_page_list[-1] + 1 main_content = "" - for page_index in range(start_page_index, start_page_index + 20): + for page_index in range(start_page_index, min(start_page_index + opt.toc_check_page_num, len(page_list))): main_content += f"\n{page_list[page_index][0]}\n\n\n" toc_with_physical_index = toc_index_extractor(toc_no_page_number, main_content, model) @@ -784,10 +785,7 @@ def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_ page_contents.append(page_text) content_range = ''.join(page_contents) - physical_index = single_toc_item_index_fixer(incorrect_item['title'], content_range, model) - - # Convert to int for checking - physical_index_int = convert_physical_index_to_int(physical_index) + physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model) # Check if the result is correct check_item = incorrect_item.copy() @@ -978,33 +976,23 @@ def tree_parser(page_list, opt, logger=None): check_toc_result = check_toc(page_list, opt) logger.info(check_toc_result) - if check_toc_result['toc_content'] is None: + if check_toc_result['toc_content'] is not None and check_toc_result['page_index_given_in_toc'] == 'yes': + toc_with_page_number = meta_processor( + page_list, + mode='process_toc_with_page_numbers', + start_index=1, + toc_content=check_toc_result['toc_content'], + toc_page_list=check_toc_result['toc_page_list'], + opt=opt, + logger=logger) + else: toc_with_page_number = meta_processor( page_list, mode='process_no_toc', start_index=1, opt=opt, logger=logger) - else: - if check_toc_result['page_index_given_in_toc'] == 'yes': - toc_with_page_number = meta_processor( - page_list, - mode='process_toc_with_page_numbers', - start_index=1, - toc_content=check_toc_result['toc_content'], - toc_page_list=check_toc_result['toc_page_list'], - opt=opt, - logger=logger) - else: - toc_with_page_number = meta_processor( - page_list, - mode='process_toc_no_page_numbers', - start_index=1, - toc_content=check_toc_result['toc_content'], - toc_page_list=check_toc_result['toc_page_list'], - opt=opt, - logger=logger) - + toc_with_page_number = add_preface_if_needed(toc_with_page_number) toc_with_page_number = check_title_appearance_in_start_parallel(toc_with_page_number, page_list, model=opt.model, logger=logger) toc_tree = post_processing(toc_with_page_number, len(page_list)) @@ -1026,6 +1014,12 @@ def page_index_main(doc, opt=None): print('Parsing PDF...') page_list = get_page_tokens(doc) + ### store text in page_list to file with their physical index + with open(f'./logs/{os.path.basename(doc)}_page_list.txt', 'w', encoding='utf-8') as f: + for page_index, page_text in enumerate(page_list): + page_text = f"\n{page_text[0]}\n\n\n" + f.write(page_text) + logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) diff --git a/results/earthmover_structure.json b/results/earthmover_structure.json new file mode 100644 index 0000000..a570bc4 --- /dev/null +++ b/results/earthmover_structure.json @@ -0,0 +1,137 @@ +{ + "doc_name": "earthmover.pdf", + "structure": [ + { + "title": "Earth Mover\u2019s Distance based Similarity Search at Scale", + "start_index": 1, + "end_index": 1, + "node_id": "0000" + }, + { + "title": "ABSTRACT", + "start_index": 1, + "end_index": 1, + "node_id": "0001" + }, + { + "title": "INTRODUCTION", + "start_index": 1, + "end_index": 2, + "node_id": "0002" + }, + { + "title": "PRELIMINARIES", + "start_index": 2, + "end_index": 2, + "nodes": [ + { + "title": "Computing the EMD", + "start_index": 3, + "end_index": 3, + "node_id": "0004" + }, + { + "title": "Filter-and-Refinement Framework", + "start_index": 3, + "end_index": 4, + "node_id": "0005" + } + ], + "node_id": "0003" + }, + { + "title": "SCALING UP SSP", + "start_index": 4, + "end_index": 5, + "node_id": "0006" + }, + { + "title": "BOOSTING THE REFINEMENT PHASE", + "start_index": 5, + "end_index": 5, + "nodes": [ + { + "title": "Analysis of EMD Calculation", + "start_index": 5, + "end_index": 6, + "node_id": "0008" + }, + { + "title": "Progressive Bounding", + "start_index": 6, + "end_index": 6, + "node_id": "0009" + }, + { + "title": "Sensitivity to Refinement Order", + "start_index": 6, + "end_index": 7, + "node_id": "0010" + }, + { + "title": "Dynamic Refinement Ordering", + "start_index": 7, + "end_index": 8, + "node_id": "0011" + }, + { + "title": "Running Upper Bound", + "start_index": 8, + "end_index": 8, + "node_id": "0012" + } + ], + "node_id": "0007" + }, + { + "title": "EXPERIMENTAL EVALUATION", + "start_index": 8, + "end_index": 9, + "nodes": [ + { + "title": "Performance Improvement", + "start_index": 9, + "end_index": 10, + "node_id": "0014" + }, + { + "title": "Scalability Experiments", + "start_index": 10, + "end_index": 11, + "node_id": "0015" + }, + { + "title": "Parameter Tuning in DRO", + "start_index": 11, + "end_index": 12, + "node_id": "0016" + } + ], + "node_id": "0013" + }, + { + "title": "RELATED WORK", + "start_index": 12, + "end_index": 12, + "node_id": "0017" + }, + { + "title": "CONCLUSION", + "start_index": 12, + "end_index": 12, + "node_id": "0018" + }, + { + "title": "ACKNOWLEDGMENT", + "start_index": 12, + "end_index": 12, + "node_id": "0019" + }, + { + "title": "REFERENCES", + "start_index": 12, + "end_index": 12, + "node_id": "0020" + } + ] +} \ No newline at end of file diff --git a/results/four-lectures_structure.json b/results/four-lectures_structure.json index cf73815..1c1cbef 100644 --- a/results/four-lectures_structure.json +++ b/results/four-lectures_structure.json @@ -2,78 +2,80 @@ "doc_name": "four-lectures.pdf", "structure": [ { - "title": "Preface", + "title": "Four Lectures on Standard ML", "start_index": 1, "end_index": 1, - "node_id": "0000" - }, - { - "title": "ML at a Glance", - "start_index": 2, - "end_index": 2, "nodes": [ { - "title": "An ML session", + "title": "ML at a Glance", "start_index": 2, - "end_index": 3, - "node_id": "0002" - }, - { - "title": "Types and Values", - "start_index": 3, - "end_index": 4, - "node_id": "0003" - }, - { - "title": "Recursive Functions", - "start_index": 4, - "end_index": 4, - "node_id": "0004" - }, - { - "title": "Raising Exceptions", - "start_index": 4, - "end_index": 5, - "node_id": "0005" - }, - { - "title": "Structures", - "start_index": 5, - "end_index": 6, - "node_id": "0006" - }, - { - "title": "Signatures", - "start_index": 6, - "end_index": 7, - "node_id": "0007" - }, - { - "title": "Coercive Signature Matching", - "start_index": 7, - "end_index": 8, - "node_id": "0008" - }, - { - "title": "Functor Declaration", - "start_index": 8, - "end_index": 9, - "node_id": "0009" - }, - { - "title": "Functor Application", - "start_index": 9, - "end_index": 9, - "node_id": "0010" - }, - { - "title": "Summary", - "start_index": 9, - "end_index": 9, - "node_id": "0011" + "end_index": 2, + "nodes": [ + { + "title": "An ML session", + "start_index": 2, + "end_index": 3, + "node_id": "0002" + }, + { + "title": "Types and Values", + "start_index": 3, + "end_index": 4, + "node_id": "0003" + }, + { + "title": "Recursive Functions", + "start_index": 4, + "end_index": 4, + "node_id": "0004" + }, + { + "title": "Raising Exceptions", + "start_index": 4, + "end_index": 5, + "node_id": "0005" + }, + { + "title": "Structures", + "start_index": 5, + "end_index": 6, + "node_id": "0006" + }, + { + "title": "Signatures", + "start_index": 6, + "end_index": 7, + "node_id": "0007" + }, + { + "title": "Coercive Signature Matching", + "start_index": 7, + "end_index": 8, + "node_id": "0008" + }, + { + "title": "Functor Declaration", + "start_index": 8, + "end_index": 9, + "node_id": "0009" + }, + { + "title": "Functor Application", + "start_index": 9, + "end_index": 9, + "node_id": "0010" + }, + { + "title": "Summary", + "start_index": 9, + "end_index": 9, + "node_id": "0011" + } + ], + "node_id": "0001" } ], - "node_id": "0001" + "node_id": "0000" }, { "title": "Programming with ML Modules", @@ -264,70 +266,14 @@ { "title": "Appendix A: The Bare Interpreter", "start_index": 44, - "end_index": 44, - "nodes": [ - { - "title": "Syntax", - "start_index": 44, - "end_index": 44, - "node_id": "0043" - }, - { - "title": "Parsing", - "start_index": 44, - "end_index": 45, - "node_id": "0044" - }, - { - "title": "Environments", - "start_index": 45, - "end_index": 46, - "node_id": "0045" - }, - { - "title": "Evaluation", - "start_index": 46, - "end_index": 46, - "node_id": "0046" - }, - { - "title": "Type Checking", - "start_index": 46, - "end_index": 46, - "node_id": "0047" - }, - { - "title": "The Interpreter", - "start_index": 46, - "end_index": 47, - "node_id": "0048" - }, - { - "title": "The Evaluator", - "start_index": 47, - "end_index": 49, - "node_id": "0049" - }, - { - "title": "The Typechecker", - "start_index": 49, - "end_index": 50, - "node_id": "0050" - }, - { - "title": "The Basics", - "start_index": 50, - "end_index": 52, - "node_id": "0051" - } - ], + "end_index": 52, "node_id": "0042" }, { "title": "Appendix B: Files", "start_index": 53, "end_index": 53, - "node_id": "0052" + "node_id": "0043" } ] } \ No newline at end of file diff --git a/utils.py b/utils.py index 6306aee..f8a39a0 100644 --- a/utils.py +++ b/utils.py @@ -492,24 +492,25 @@ def check_token_limit(structure, limit=110000): print("Start Index:", node['start_index']) print("End Index:", node['end_index']) print("Title:", node['title']) - # print(node['text']) print("\n") def convert_physical_index_to_int(data): if isinstance(data, list): for i in range(len(data)): - if isinstance(data[i]['physical_index'], str): - if data[i]['physical_index'].startswith('').strip()) - elif data[i]['physical_index'].startswith('physical_index_'): - data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip()) + # Check if item is a dictionary and has 'physical_index' key + if isinstance(data[i], dict) and 'physical_index' in data[i]: + if isinstance(data[i]['physical_index'], str): + if data[i]['physical_index'].startswith('').strip()) + elif data[i]['physical_index'].startswith('physical_index_'): + data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip()) elif isinstance(data, str): if data.startswith('').strip()) elif data.startswith('physical_index_'): data = int(data.split('_')[-1].strip()) - ###check data is int + # Check data is int if isinstance(data, int): return data else: