disable process_toc_no_page_numbers

This commit is contained in:
mingtian 2025-04-06 19:29:01 +08:00
parent d6b92e0372
commit 23d1614291
5 changed files with 238 additions and 160 deletions

BIN
docs/earthmover.pdf Normal file

Binary file not shown.

View file

@ -27,7 +27,7 @@ def check_title_appearance(item, page_list, start_index=1, model=None):
prompt = f""" prompt = f"""
Your job is to check if the given section appears or starts in the given page_text. Your job is to check if the given section appears or starts in the given page_text.
Note: ignore any space inconsistency in the page_text. Note: do fuzzy matching, ignore any space inconsistency in the page_text.
The given section title is {title}. The given section title is {title}.
The given page_text is {page_text}. The given page_text is {page_text}.
@ -178,7 +178,7 @@ def extract_toc_content(content, model=None):
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
response = response + new_response response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response) if_complete = check_if_toc_transformation_is_complete(content, response, model)
while not (if_complete == "yes" and finish_reason == "finished"): while not (if_complete == "yes" and finish_reason == "finished"):
chat_history = [ chat_history = [
@ -188,7 +188,7 @@ def extract_toc_content(content, model=None):
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure""" prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history) new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
response = response + new_response response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response) if_complete = check_if_toc_transformation_is_complete(content, response, model)
# Optional: Add a maximum retry limit to prevent infinite loops # Optional: Add a maximum retry limit to prevent infinite loops
if len(chat_history) > 5: # Arbitrary limit of 10 attempts if len(chat_history) > 5: # Arbitrary limit of 10 attempts
@ -207,6 +207,7 @@ def detect_page_index(toc_content, model=None):
Reply format: Reply format:
{{ {{
"thinking": <why do you think there are page numbers/indices given within the table of contents>
"page_index_given_in_toc": "<yes or no>" "page_index_given_in_toc": "<yes or no>"
}} }}
Directly return the final JSON structure. Do not output anything else.""" Directly return the final JSON structure. Do not output anything else."""
@ -318,7 +319,7 @@ def toc_transformer(toc_content, model=None):
new_complete = get_json_content(new_complete) new_complete = get_json_content(new_complete)
last_complete = last_complete+new_complete last_complete = last_complete+new_complete
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete) if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
last_complete = json.loads(last_complete) last_complete = json.loads(last_complete)
@ -615,7 +616,7 @@ def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, model=N
start_page_index = toc_page_list[-1] + 1 start_page_index = toc_page_list[-1] + 1
main_content = "" main_content = ""
for page_index in range(start_page_index, start_page_index + 20): for page_index in range(start_page_index, min(start_page_index + opt.toc_check_page_num, len(page_list))):
main_content += f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n" main_content += f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n"
toc_with_physical_index = toc_index_extractor(toc_no_page_number, main_content, model) toc_with_physical_index = toc_index_extractor(toc_no_page_number, main_content, model)
@ -784,10 +785,7 @@ def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_
page_contents.append(page_text) page_contents.append(page_text)
content_range = ''.join(page_contents) content_range = ''.join(page_contents)
physical_index = single_toc_item_index_fixer(incorrect_item['title'], content_range, model) physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)
# Convert to int for checking
physical_index_int = convert_physical_index_to_int(physical_index)
# Check if the result is correct # Check if the result is correct
check_item = incorrect_item.copy() check_item = incorrect_item.copy()
@ -978,33 +976,23 @@ def tree_parser(page_list, opt, logger=None):
check_toc_result = check_toc(page_list, opt) check_toc_result = check_toc(page_list, opt)
logger.info(check_toc_result) logger.info(check_toc_result)
if check_toc_result['toc_content'] is None: if check_toc_result['toc_content'] is not None and check_toc_result['page_index_given_in_toc'] == 'yes':
toc_with_page_number = meta_processor(
page_list,
mode='process_toc_with_page_numbers',
start_index=1,
toc_content=check_toc_result['toc_content'],
toc_page_list=check_toc_result['toc_page_list'],
opt=opt,
logger=logger)
else:
toc_with_page_number = meta_processor( toc_with_page_number = meta_processor(
page_list, page_list,
mode='process_no_toc', mode='process_no_toc',
start_index=1, start_index=1,
opt=opt, opt=opt,
logger=logger) logger=logger)
else:
if check_toc_result['page_index_given_in_toc'] == 'yes':
toc_with_page_number = meta_processor(
page_list,
mode='process_toc_with_page_numbers',
start_index=1,
toc_content=check_toc_result['toc_content'],
toc_page_list=check_toc_result['toc_page_list'],
opt=opt,
logger=logger)
else:
toc_with_page_number = meta_processor(
page_list,
mode='process_toc_no_page_numbers',
start_index=1,
toc_content=check_toc_result['toc_content'],
toc_page_list=check_toc_result['toc_page_list'],
opt=opt,
logger=logger)
toc_with_page_number = add_preface_if_needed(toc_with_page_number) toc_with_page_number = add_preface_if_needed(toc_with_page_number)
toc_with_page_number = check_title_appearance_in_start_parallel(toc_with_page_number, page_list, model=opt.model, logger=logger) toc_with_page_number = check_title_appearance_in_start_parallel(toc_with_page_number, page_list, model=opt.model, logger=logger)
toc_tree = post_processing(toc_with_page_number, len(page_list)) toc_tree = post_processing(toc_with_page_number, len(page_list))
@ -1026,6 +1014,12 @@ def page_index_main(doc, opt=None):
print('Parsing PDF...') print('Parsing PDF...')
page_list = get_page_tokens(doc) page_list = get_page_tokens(doc)
### store text in page_list to file with their physical index
with open(f'./logs/{os.path.basename(doc)}_page_list.txt', 'w', encoding='utf-8') as f:
for page_index, page_text in enumerate(page_list):
page_text = f"<physical_index_{page_index+1}>\n{page_text[0]}\n<physical_index_{page_index+1}>\n\n"
f.write(page_text)
logger.info({'total_page_number': len(page_list)}) logger.info({'total_page_number': len(page_list)})
logger.info({'total_token': sum([page[1] for page in page_list])}) logger.info({'total_token': sum([page[1] for page in page_list])})

View file

@ -0,0 +1,137 @@
{
"doc_name": "earthmover.pdf",
"structure": [
{
"title": "Earth Mover\u2019s Distance based Similarity Search at Scale",
"start_index": 1,
"end_index": 1,
"node_id": "0000"
},
{
"title": "ABSTRACT",
"start_index": 1,
"end_index": 1,
"node_id": "0001"
},
{
"title": "INTRODUCTION",
"start_index": 1,
"end_index": 2,
"node_id": "0002"
},
{
"title": "PRELIMINARIES",
"start_index": 2,
"end_index": 2,
"nodes": [
{
"title": "Computing the EMD",
"start_index": 3,
"end_index": 3,
"node_id": "0004"
},
{
"title": "Filter-and-Refinement Framework",
"start_index": 3,
"end_index": 4,
"node_id": "0005"
}
],
"node_id": "0003"
},
{
"title": "SCALING UP SSP",
"start_index": 4,
"end_index": 5,
"node_id": "0006"
},
{
"title": "BOOSTING THE REFINEMENT PHASE",
"start_index": 5,
"end_index": 5,
"nodes": [
{
"title": "Analysis of EMD Calculation",
"start_index": 5,
"end_index": 6,
"node_id": "0008"
},
{
"title": "Progressive Bounding",
"start_index": 6,
"end_index": 6,
"node_id": "0009"
},
{
"title": "Sensitivity to Refinement Order",
"start_index": 6,
"end_index": 7,
"node_id": "0010"
},
{
"title": "Dynamic Refinement Ordering",
"start_index": 7,
"end_index": 8,
"node_id": "0011"
},
{
"title": "Running Upper Bound",
"start_index": 8,
"end_index": 8,
"node_id": "0012"
}
],
"node_id": "0007"
},
{
"title": "EXPERIMENTAL EVALUATION",
"start_index": 8,
"end_index": 9,
"nodes": [
{
"title": "Performance Improvement",
"start_index": 9,
"end_index": 10,
"node_id": "0014"
},
{
"title": "Scalability Experiments",
"start_index": 10,
"end_index": 11,
"node_id": "0015"
},
{
"title": "Parameter Tuning in DRO",
"start_index": 11,
"end_index": 12,
"node_id": "0016"
}
],
"node_id": "0013"
},
{
"title": "RELATED WORK",
"start_index": 12,
"end_index": 12,
"node_id": "0017"
},
{
"title": "CONCLUSION",
"start_index": 12,
"end_index": 12,
"node_id": "0018"
},
{
"title": "ACKNOWLEDGMENT",
"start_index": 12,
"end_index": 12,
"node_id": "0019"
},
{
"title": "REFERENCES",
"start_index": 12,
"end_index": 12,
"node_id": "0020"
}
]
}

View file

@ -2,78 +2,80 @@
"doc_name": "four-lectures.pdf", "doc_name": "four-lectures.pdf",
"structure": [ "structure": [
{ {
"title": "Preface", "title": "Four Lectures on Standard ML",
"start_index": 1, "start_index": 1,
"end_index": 1, "end_index": 1,
"node_id": "0000"
},
{
"title": "ML at a Glance",
"start_index": 2,
"end_index": 2,
"nodes": [ "nodes": [
{ {
"title": "An ML session", "title": "ML at a Glance",
"start_index": 2, "start_index": 2,
"end_index": 3, "end_index": 2,
"node_id": "0002" "nodes": [
}, {
{ "title": "An ML session",
"title": "Types and Values", "start_index": 2,
"start_index": 3, "end_index": 3,
"end_index": 4, "node_id": "0002"
"node_id": "0003" },
}, {
{ "title": "Types and Values",
"title": "Recursive Functions", "start_index": 3,
"start_index": 4, "end_index": 4,
"end_index": 4, "node_id": "0003"
"node_id": "0004" },
}, {
{ "title": "Recursive Functions",
"title": "Raising Exceptions", "start_index": 4,
"start_index": 4, "end_index": 4,
"end_index": 5, "node_id": "0004"
"node_id": "0005" },
}, {
{ "title": "Raising Exceptions",
"title": "Structures", "start_index": 4,
"start_index": 5, "end_index": 5,
"end_index": 6, "node_id": "0005"
"node_id": "0006" },
}, {
{ "title": "Structures",
"title": "Signatures", "start_index": 5,
"start_index": 6, "end_index": 6,
"end_index": 7, "node_id": "0006"
"node_id": "0007" },
}, {
{ "title": "Signatures",
"title": "Coercive Signature Matching", "start_index": 6,
"start_index": 7, "end_index": 7,
"end_index": 8, "node_id": "0007"
"node_id": "0008" },
}, {
{ "title": "Coercive Signature Matching",
"title": "Functor Declaration", "start_index": 7,
"start_index": 8, "end_index": 8,
"end_index": 9, "node_id": "0008"
"node_id": "0009" },
}, {
{ "title": "Functor Declaration",
"title": "Functor Application", "start_index": 8,
"start_index": 9, "end_index": 9,
"end_index": 9, "node_id": "0009"
"node_id": "0010" },
}, {
{ "title": "Functor Application",
"title": "Summary", "start_index": 9,
"start_index": 9, "end_index": 9,
"end_index": 9, "node_id": "0010"
"node_id": "0011" },
{
"title": "Summary",
"start_index": 9,
"end_index": 9,
"node_id": "0011"
}
],
"node_id": "0001"
} }
], ],
"node_id": "0001" "node_id": "0000"
}, },
{ {
"title": "Programming with ML Modules", "title": "Programming with ML Modules",
@ -264,70 +266,14 @@
{ {
"title": "Appendix A: The Bare Interpreter", "title": "Appendix A: The Bare Interpreter",
"start_index": 44, "start_index": 44,
"end_index": 44, "end_index": 52,
"nodes": [
{
"title": "Syntax",
"start_index": 44,
"end_index": 44,
"node_id": "0043"
},
{
"title": "Parsing",
"start_index": 44,
"end_index": 45,
"node_id": "0044"
},
{
"title": "Environments",
"start_index": 45,
"end_index": 46,
"node_id": "0045"
},
{
"title": "Evaluation",
"start_index": 46,
"end_index": 46,
"node_id": "0046"
},
{
"title": "Type Checking",
"start_index": 46,
"end_index": 46,
"node_id": "0047"
},
{
"title": "The Interpreter",
"start_index": 46,
"end_index": 47,
"node_id": "0048"
},
{
"title": "The Evaluator",
"start_index": 47,
"end_index": 49,
"node_id": "0049"
},
{
"title": "The Typechecker",
"start_index": 49,
"end_index": 50,
"node_id": "0050"
},
{
"title": "The Basics",
"start_index": 50,
"end_index": 52,
"node_id": "0051"
}
],
"node_id": "0042" "node_id": "0042"
}, },
{ {
"title": "Appendix B: Files", "title": "Appendix B: Files",
"start_index": 53, "start_index": 53,
"end_index": 53, "end_index": 53,
"node_id": "0052" "node_id": "0043"
} }
] ]
} }

View file

@ -492,24 +492,25 @@ def check_token_limit(structure, limit=110000):
print("Start Index:", node['start_index']) print("Start Index:", node['start_index'])
print("End Index:", node['end_index']) print("End Index:", node['end_index'])
print("Title:", node['title']) print("Title:", node['title'])
# print(node['text'])
print("\n") print("\n")
def convert_physical_index_to_int(data): def convert_physical_index_to_int(data):
if isinstance(data, list): if isinstance(data, list):
for i in range(len(data)): for i in range(len(data)):
if isinstance(data[i]['physical_index'], str): # Check if item is a dictionary and has 'physical_index' key
if data[i]['physical_index'].startswith('<physical_index_'): if isinstance(data[i], dict) and 'physical_index' in data[i]:
data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].rstrip('>').strip()) if isinstance(data[i]['physical_index'], str):
elif data[i]['physical_index'].startswith('physical_index_'): if data[i]['physical_index'].startswith('<physical_index_'):
data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip()) data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].rstrip('>').strip())
elif data[i]['physical_index'].startswith('physical_index_'):
data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip())
elif isinstance(data, str): elif isinstance(data, str):
if data.startswith('<physical_index_'): if data.startswith('<physical_index_'):
data = int(data.split('_')[-1].rstrip('>').strip()) data = int(data.split('_')[-1].rstrip('>').strip())
elif data.startswith('physical_index_'): elif data.startswith('physical_index_'):
data = int(data.split('_')[-1].strip()) data = int(data.split('_')[-1].strip())
###check data is int # Check data is int
if isinstance(data, int): if isinstance(data, int):
return data return data
else: else: