Merge pull request #10 from rejojer/working

Working
This commit is contained in:
Mingtian Zhang 2025-04-20 23:09:28 +08:00 committed by GitHub
commit 44b643ffd3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 135 additions and 112 deletions

View file

@ -4,4 +4,5 @@ max_page_num_each_node: 10
max_token_num_each_node: 20000 max_token_num_each_node: 20000
if_add_node_id: "yes" if_add_node_id: "yes"
if_add_node_summary: "no" if_add_node_summary: "no"
if_add_doc_description: "yes" if_add_doc_description: "yes"
if_add_node_text: "no"

View file

@ -7,11 +7,10 @@ import re
from .utils import * from .utils import *
import os import os
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
import argparse
################### check title in page ######################################################### ################### check title in page #########################################################
def check_title_appearance(item, page_list, start_index=1, model=None): async def check_title_appearance(item, page_list, start_index=1, model=None):
title=item['title'] title=item['title']
if 'physical_index' not in item or item['physical_index'] is None: if 'physical_index' not in item or item['physical_index'] is None:
return {'list_index': item.get('list_index'), 'answer': 'no', 'title':title, 'page_number': None} return {'list_index': item.get('list_index'), 'answer': 'no', 'title':title, 'page_number': None}
@ -37,7 +36,7 @@ def check_title_appearance(item, page_list, start_index=1, model=None):
}} }}
Directly return the final JSON structure. Do not output anything else.""" Directly return the final JSON structure. Do not output anything else."""
response = ChatGPT_API(model=model, prompt=prompt) response = await ChatGPT_API_async(model=model, prompt=prompt)
response = extract_json(response) response = extract_json(response)
if 'answer' in response: if 'answer' in response:
answer = response['answer'] answer = response['answer']
@ -46,9 +45,9 @@ def check_title_appearance(item, page_list, start_index=1, model=None):
return {'list_index': item['list_index'], 'answer': answer, 'title': title, 'page_number': page_number} return {'list_index': item['list_index'], 'answer': answer, 'title': title, 'page_number': page_number}
def check_title_appearance_in_start(title, page_text, model=None, logger=None): async def check_title_appearance_in_start(title, page_text, model=None, logger=None):
prompt = f""" prompt = f"""
You will be given given the current section title and the current page_text. You will be given the current section title and the current page_text.
Your job is to check if the current section starts in the beginning of the given page_text. Your job is to check if the current section starts in the beginning of the given page_text.
If there are other contents before the current section title, then the current section does not start in the beginning of the given page_text. If there are other contents before the current section title, then the current section does not start in the beginning of the given page_text.
If the current section title is the first content in the given page_text, then the current section starts in the beginning of the given page_text. If the current section title is the first content in the given page_text, then the current section starts in the beginning of the given page_text.
@ -65,36 +64,40 @@ def check_title_appearance_in_start(title, page_text, model=None, logger=None):
}} }}
Directly return the final JSON structure. Do not output anything else.""" Directly return the final JSON structure. Do not output anything else."""
response = ChatGPT_API(model=model, prompt=prompt) response = await ChatGPT_API_async(model=model, prompt=prompt)
response = extract_json(response) response = extract_json(response)
if logger: if logger:
logger.info(f"Response: {response}") logger.info(f"Response: {response}")
if 'start_begin' in response: return response.get("start_begin", "no")
return response['start_begin']
else:
return 'no'
def check_title_appearance_in_start_parallel(structure, page_list, model=None, logger=None): async def check_title_appearance_in_start_concurrent(structure, page_list, model=None, logger=None):
if logger: if logger:
logger.info(f"Checking title appearance in start parallel") logger.info("Checking title appearance in start concurrently")
with ThreadPoolExecutor(max_workers=10) as executor:
future_to_item = {
executor.submit(check_title_appearance_in_start, item['title'], page_list[item['physical_index']-1][0], model=model, logger=logger): item
for item in structure
}
# Process completed futures and attach results to items
for future in as_completed(future_to_item):
item = future_to_item[future]
try:
result = future.result()
item['appear_start'] = result
except Exception as e:
if logger:
logger.error(f"Error processing item {item['title']}: {str(e)}")
item['appear_start'] = 'no'
# skip items without physical_index
for item in structure:
if item.get('physical_index') is None:
item['appear_start'] = 'no'
# only for items with valid physical_index
tasks = []
valid_items = []
for item in structure:
if item.get('physical_index') is not None:
page_text = page_list[item['physical_index'] - 1][0]
tasks.append(check_title_appearance_in_start(item['title'], page_text, model=model, logger=logger))
valid_items.append(item)
results = await asyncio.gather(*tasks, return_exceptions=True)
for item, result in zip(valid_items, results):
if isinstance(result, Exception):
if logger:
logger.error(f"Error checking start for {item['title']}: {result}")
item['appear_start'] = 'no'
else:
item['appear_start'] = result
return structure return structure
@ -505,14 +508,15 @@ def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
For the title, you need to extract the original title from the text, only fix the space inconsistency. For the title, you need to extract the original title from the text, only fix the space inconsistency.
The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X. \ The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X. \
For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format.
The response should be in the following format. The response should be in the following format.
[ [
{ {
"structure": <structure index, "x.x.x" or None> (string), "structure": <structure index, "x.x.x"> (string),
"title": <title of the section, keep the original title>, "title": <title of the section, keep the original title>,
"physical_index": "<physical_index_X> (keep the format)" or None "physical_index": "<physical_index_X> (keep the format)"
}, },
... ...
] ]
@ -538,13 +542,15 @@ def generate_toc_init(part, model=None):
The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X. The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X.
For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format.
The response should be in the following format. The response should be in the following format.
[ [
{ {{
"structure": <structure index, "x.x.x" or None> (string), "structure": <structure index, "x.x.x"> (string),
"title": <title of the section, keep the original title>, "title": <title of the section, keep the original title>,
"physical_index": "<physical_index_X> (keep the format)" or None "physical_index": "<physical_index_X> (keep the format)"
}, }},
], ],
@ -738,7 +744,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20
def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_index=1, model=None, logger=None): async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_index=1, model=None, logger=None):
print(f'start fix_incorrect_toc with {len(incorrect_results)} incorrect results') print(f'start fix_incorrect_toc with {len(incorrect_results)} incorrect results')
incorrect_indices = {result['list_index'] for result in incorrect_results} incorrect_indices = {result['list_index'] for result in incorrect_results}
@ -746,7 +752,7 @@ def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_
incorrect_results_and_range_logs = [] incorrect_results_and_range_logs = []
# Helper function to process and check a single incorrect item # Helper function to process and check a single incorrect item
def process_and_check_item(incorrect_item): async def process_and_check_item(incorrect_item):
list_index = incorrect_item['list_index'] list_index = incorrect_item['list_index']
# Find the previous correct item # Find the previous correct item
prev_correct = None prev_correct = None
@ -786,7 +792,7 @@ def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_
# Check if the result is correct # Check if the result is correct
check_item = incorrect_item.copy() check_item = incorrect_item.copy()
check_item['physical_index'] = physical_index_int check_item['physical_index'] = physical_index_int
check_result = check_title_appearance(check_item, page_list, start_index, model) check_result = await check_title_appearance(check_item, page_list, start_index, model)
return { return {
'list_index': list_index, 'list_index': list_index,
@ -794,20 +800,19 @@ def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_
'physical_index': physical_index_int, 'physical_index': physical_index_int,
'is_valid': check_result['answer'] == 'yes' 'is_valid': check_result['answer'] == 'yes'
} }
# Process incorrect items concurrently
results = [] tasks = [
with ThreadPoolExecutor() as executor: process_and_check_item(item)
future_to_item = {executor.submit(process_and_check_item, item): item for item in incorrect_results} for item in incorrect_results
for future in as_completed(future_to_item): ]
item = future_to_item[future] results = await asyncio.gather(*tasks, return_exceptions=True)
for item, result in zip(incorrect_results, results):
try: if isinstance(result, Exception):
result = future.result() print(f"Processing item {item} generated an exception: {result}")
results.append(result) continue
except Exception as exc: results = [result for result in results if not isinstance(result, Exception)]
print(f"Processing item {item} generated an exception: {exc}")
# Update the toc_with_page_number with the fixed indices and check for any invalid results # Update the toc_with_page_number with the fixed indices and check for any invalid results
invalid_results = [] invalid_results = []
for result in results: for result in results:
@ -827,7 +832,7 @@ def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_
def fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results, start_index=1, max_attempts=3, model=None, logger=None): async def fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results, start_index=1, max_attempts=3, model=None, logger=None):
print('start fix_incorrect_toc') print('start fix_incorrect_toc')
fix_attempt = 0 fix_attempt = 0
current_toc = toc_with_page_number current_toc = toc_with_page_number
@ -836,7 +841,7 @@ def fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_re
while current_incorrect: while current_incorrect:
print(f"Fixing {len(current_incorrect)} incorrect results") print(f"Fixing {len(current_incorrect)} incorrect results")
current_toc, current_incorrect = fix_incorrect_toc(current_toc, page_list, current_incorrect, start_index, model, logger) current_toc, current_incorrect = await fix_incorrect_toc(current_toc, page_list, current_incorrect, start_index, model, logger)
fix_attempt += 1 fix_attempt += 1
if fix_attempt >= max_attempts: if fix_attempt >= max_attempts:
@ -849,7 +854,7 @@ def fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_re
################### verify toc ######################################################### ################### verify toc #########################################################
def verify_toc(page_list, list_result, start_index=1, N=None, model=None): async def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
print('start verify_toc') print('start verify_toc')
# Find the last non-None physical_index # Find the last non-None physical_index
last_physical_index = None last_physical_index = None
@ -879,16 +884,12 @@ def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
item_with_index['list_index'] = idx # Add the original index in list_result item_with_index['list_index'] = idx # Add the original index in list_result
indexed_sample_list.append(item_with_index) indexed_sample_list.append(item_with_index)
# Run checks in parallel # Run checks concurrently
results = [] tasks = [
with ThreadPoolExecutor(max_workers=10) as executor: check_title_appearance(item, page_list, start_index, model)
future_to_item = { for item in indexed_sample_list
executor.submit(check_title_appearance, item, page_list, start_index, model): item ]
for item in indexed_sample_list results = await asyncio.gather(*tasks)
}
for future in as_completed(future_to_item):
results.append(future.result())
# Process results # Process results
correct_count = 0 correct_count = 0
@ -910,7 +911,7 @@ def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
################### main process ######################################################### ################### main process #########################################################
def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=None, start_index=1, opt=None, logger=None): async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=None, start_index=1, opt=None, logger=None):
print(mode) print(mode)
print(f'start_index: {start_index}') print(f'start_index: {start_index}')
@ -922,7 +923,7 @@ def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=None, s
toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger) toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None] toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]
accuracy, incorrect_results = verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model) accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)
logger.info({ logger.info({
'mode': 'process_toc_with_page_numbers', 'mode': 'process_toc_with_page_numbers',
@ -932,26 +933,26 @@ def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=None, s
if accuracy == 1.0 and len(incorrect_results) == 0: if accuracy == 1.0 and len(incorrect_results) == 0:
return toc_with_page_number return toc_with_page_number
if accuracy > 0.6 and len(incorrect_results) > 0: if accuracy > 0.6 and len(incorrect_results) > 0:
toc_with_page_number, incorrect_results = fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results,start_index=start_index, max_attempts=3, model=opt.model, logger=logger) toc_with_page_number, incorrect_results = await fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results,start_index=start_index, max_attempts=3, model=opt.model, logger=logger)
return toc_with_page_number return toc_with_page_number
else: else:
if mode == 'process_toc_with_page_numbers': if mode == 'process_toc_with_page_numbers':
return meta_processor(page_list, mode='process_toc_no_page_numbers', toc_content=toc_content, toc_page_list=toc_page_list, start_index=start_index, opt=opt, logger=logger) return await meta_processor(page_list, mode='process_toc_no_page_numbers', toc_content=toc_content, toc_page_list=toc_page_list, start_index=start_index, opt=opt, logger=logger)
elif mode == 'process_toc_no_page_numbers': elif mode == 'process_toc_no_page_numbers':
return meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger) return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
else: else:
raise Exception('Processing failed') raise Exception('Processing failed')
def process_large_node_recursively(node, page_list, opt=None, logger=None): async def process_large_node_recursively(node, page_list, opt=None, logger=None):
node_page_list = page_list[node['start_index']-1:node['end_index']-1] node_page_list = page_list[node['start_index']-1:node['end_index']]
token_num = sum([page[1] for page in node_page_list]) token_num = sum([page[1] for page in node_page_list])
if node['end_index'] - node['start_index'] > opt.max_page_num_each_node and token_num >= opt.max_token_num_each_node: if node['end_index'] - node['start_index'] > opt.max_page_num_each_node and token_num >= opt.max_token_num_each_node:
print('large node:', node['title'], 'start_index:', node['start_index'], 'end_index:', node['end_index'], 'token_num:', token_num) print('large node:', node['title'], 'start_index:', node['start_index'], 'end_index:', node['end_index'], 'token_num:', token_num)
node_toc_tree = meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger) node_toc_tree = await meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger)
node_toc_tree = check_title_appearance_in_start_parallel(node_toc_tree, page_list, model=opt.model, logger=logger) node_toc_tree = await check_title_appearance_in_start_concurrent(node_toc_tree, page_list, model=opt.model, logger=logger)
if node['title'].strip() == node_toc_tree[0]['title'].strip(): if node['title'].strip() == node_toc_tree[0]['title'].strip():
node['nodes'] = post_processing(node_toc_tree[1:], node['end_index']) node['nodes'] = post_processing(node_toc_tree[1:], node['end_index'])
@ -961,17 +962,20 @@ def process_large_node_recursively(node, page_list, opt=None, logger=None):
node['end_index'] = node_toc_tree[0]['start_index'] node['end_index'] = node_toc_tree[0]['start_index']
if 'nodes' in node and node['nodes']: if 'nodes' in node and node['nodes']:
for child_node in node['nodes']: tasks = [
process_large_node_recursively(child_node, page_list, opt, logger=logger) process_large_node_recursively(child_node, page_list, opt, logger=logger)
for child_node in node['nodes']
]
await asyncio.gather(*tasks)
return node return node
def tree_parser(page_list, opt, logger=None): async def tree_parser(page_list, opt, doc=None, logger=None):
check_toc_result = check_toc(page_list, opt) check_toc_result = check_toc(page_list, opt)
logger.info(check_toc_result) logger.info(check_toc_result)
if check_toc_result['toc_content'] is not None and check_toc_result['page_index_given_in_toc'] == 'yes': if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip() and check_toc_result["page_index_given_in_toc"] == "yes":
toc_with_page_number = meta_processor( toc_with_page_number = await meta_processor(
page_list, page_list,
mode='process_toc_with_page_numbers', mode='process_toc_with_page_numbers',
start_index=1, start_index=1,
@ -980,7 +984,7 @@ def tree_parser(page_list, opt, logger=None):
opt=opt, opt=opt,
logger=logger) logger=logger)
else: else:
toc_with_page_number = meta_processor( toc_with_page_number = await meta_processor(
page_list, page_list,
mode='process_no_toc', mode='process_no_toc',
start_index=1, start_index=1,
@ -988,10 +992,13 @@ def tree_parser(page_list, opt, logger=None):
logger=logger) logger=logger)
toc_with_page_number = add_preface_if_needed(toc_with_page_number) toc_with_page_number = add_preface_if_needed(toc_with_page_number)
toc_with_page_number = check_title_appearance_in_start_parallel(toc_with_page_number, page_list, model=opt.model, logger=logger) toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger)
toc_tree = post_processing(toc_with_page_number, len(page_list)) toc_tree = post_processing(toc_with_page_number, len(page_list))
for node in toc_tree: tasks = [
process_large_node_recursively(node, page_list, opt, logger=logger) process_large_node_recursively(node, page_list, opt, logger=logger)
for node in toc_tree
]
await asyncio.gather(*tasks)
return toc_tree return toc_tree
@ -1012,13 +1019,15 @@ def page_index_main(doc, opt=None):
logger.info({'total_page_number': len(page_list)}) logger.info({'total_page_number': len(page_list)})
logger.info({'total_token': sum([page[1] for page in page_list])}) logger.info({'total_token': sum([page[1] for page in page_list])})
structure = tree_parser(page_list, opt, logger=logger) structure = asyncio.run(tree_parser(page_list, opt, doc=doc, logger=logger))
if opt.if_add_node_id == 'yes': if opt.if_add_node_id == 'yes':
write_node_id(structure) write_node_id(structure)
if opt.if_add_node_summary == 'yes': if opt.if_add_node_summary == 'yes':
add_node_text(structure, page_list) add_node_text(structure, page_list)
asyncio.run(generate_summaries_for_structure(structure, model=opt.model)) asyncio.run(generate_summaries_for_structure(structure, model=opt.model))
remove_structure_text(structure) remove_structure_text(structure)
if opt.if_add_node_text == 'yes':
add_node_text_with_labels(structure, page_list)
if opt.if_add_doc_description == 'yes': if opt.if_add_doc_description == 'yes':
doc_description = generate_doc_description(structure, model=opt.model) doc_description = generate_doc_description(structure, model=opt.model)
return { return {
@ -1033,7 +1042,7 @@ def page_index_main(doc, opt=None):
def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None): if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):
user_opt = { user_opt = {
arg: value for arg, value in locals().items() arg: value for arg, value in locals().items()

View file

@ -409,26 +409,31 @@ def add_preface_if_needed(data):
def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
enc = tiktoken.encoding_for_model(model)
if pdf_parser == "PyPDF2": if pdf_parser == "PyPDF2":
pdf_reader = PyPDF2.PdfReader(pdf_path) pdf_reader = PyPDF2.PdfReader(pdf_path)
page_list = []
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
page_text = page.extract_text()
token_length = len(enc.encode(page_text))
page_list.append((page_text, token_length))
return page_list
elif pdf_parser == "PyMuPDF": elif pdf_parser == "PyMuPDF":
pdf_reader = pymupdf.open(pdf_path) if isinstance(pdf_path, BytesIO):
pdf_stream = pdf_path
doc = pymupdf.open(stream=pdf_stream, filetype="pdf")
elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"):
doc = pymupdf.open(pdf_path)
page_list = []
for page in doc:
page_text = page.get_text()
token_length = len(enc.encode(page_text))
page_list.append((page_text, token_length))
return page_list
else: else:
raise ValueError(f"Unsupported PDF parser: {pdf_parser}") raise ValueError(f"Unsupported PDF parser: {pdf_parser}")
enc = tiktoken.encoding_for_model(model)
page_list = []
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
page_text = page.extract_text()
token_length = len(enc.encode(page_text))
page_list.append((page_text, token_length))
return page_list
def get_text_of_pdf_pages(pdf_pages, start_page, end_page): def get_text_of_pdf_pages(pdf_pages, start_page, end_page):
@ -437,6 +442,12 @@ def get_text_of_pdf_pages(pdf_pages, start_page, end_page):
text += pdf_pages[page_num][0] text += pdf_pages[page_num][0]
return text return text
def get_text_of_pdf_pages_with_labels(pdf_pages, start_page, end_page):
    """Return the text of a page range with each page wrapped in index labels.

    Every page is emitted as::

        <physical_index_N>
        ...page text...
        <physical_index_N>

    where N is the 1-based page number. The same label is used before and
    after the page text (there is no distinct closing form).

    Args:
        pdf_pages: Sequence of per-page records; element ``[0]`` of each
            record is used as the page text.
        start_page: 1-based number of the first page to include.
        end_page: 1-based number of the last page to include (inclusive).

    Returns:
        The concatenated labelled text, or ``""`` when the range is empty
        (``start_page > end_page``).

    Raises:
        IndexError: If the range reaches past the end of ``pdf_pages``.
    """
    # Build all pieces lazily and join once instead of growing a string
    # with ``+=`` on every page.
    return "".join(
        f"<physical_index_{page_num+1}>\n{pdf_pages[page_num][0]}\n<physical_index_{page_num+1}>\n"
        for page_num in range(start_page-1, end_page)
    )
def get_number_of_pages(pdf_path): def get_number_of_pages(pdf_path):
pdf_reader = PyPDF2.PdfReader(pdf_path) pdf_reader = PyPDF2.PdfReader(pdf_path)
num = len(pdf_reader.pages) num = len(pdf_reader.pages)
@ -534,18 +545,6 @@ def convert_page_to_int(data):
pass pass
return data return data
def write_node_id(data, node_id=0):
    """Assign sequential, zero-padded ``node_id`` values to a nested structure.

    Dicts are numbered in depth-first, pre-order: a dict receives its id
    first, then the values stored under any of its keys whose name contains
    the substring ``'nodes'`` are visited. Lists are walked element by
    element. The structure is modified in place.

    Args:
        data: A dict, a list, or any other value. Values that are neither
            dict nor list are left untouched.
        node_id: The first id number to hand out.

    Returns:
        The next unused id number, i.e. ``node_id`` plus the number of dicts
        that were labelled.
    """
    if isinstance(data, dict):
        # Ids are stored as 4-character zero-padded strings, e.g. "0007".
        data['node_id'] = str(node_id).zfill(4)
        node_id += 1
        # Iterate over a snapshot of the keys, as the original did.
        for key in list(data):
            if 'nodes' in key:
                node_id = write_node_id(data[key], node_id)
    elif isinstance(data, list):
        for entry in data:
            node_id = write_node_id(entry, node_id)
    return node_id
def add_node_text(node, pdf_pages): def add_node_text(node, pdf_pages):
if isinstance(node, dict): if isinstance(node, dict):
@ -559,6 +558,20 @@ def add_node_text(node, pdf_pages):
add_node_text(node[index], pdf_pages) add_node_text(node[index], pdf_pages)
return return
def add_node_text_with_labels(node, pdf_pages):
    """Attach labelled page text to every node of a tree, in place.

    For each dict node, ``node['text']`` is set to the result of
    ``get_text_of_pdf_pages_with_labels`` for the node's ``start_index`` to
    ``end_index`` page range, and the node's ``'nodes'`` entry (if present)
    is processed recursively. A list is processed element by element. Any
    other value is ignored.

    Args:
        node: A dict node, a list of nodes, or any other value.
        pdf_pages: Per-page records passed through unchanged to
            ``get_text_of_pdf_pages_with_labels``.

    Returns:
        None. The structure is modified in place.
    """
    if isinstance(node, dict):
        # NOTE(review): a missing 'start_index'/'end_index' yields None here,
        # which the helper's page arithmetic will reject; callers appear to
        # rely on both being set. Confirm against the tree builder.
        start_page = node.get('start_index')
        end_page = node.get('end_index')
        node['text'] = get_text_of_pdf_pages_with_labels(pdf_pages, start_page, end_page)
        if 'nodes' in node:
            add_node_text_with_labels(node['nodes'], pdf_pages)
    elif isinstance(node, list):
        for child in node:
            add_node_text_with_labels(child, pdf_pages)
async def generate_node_summary(node, model=None): async def generate_node_summary(node, model=None):
prompt = f"""You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document. prompt = f"""You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document.