mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-16 18:25:14 +02:00
add node id, node summary and doc summary
This commit is contained in:
parent
1de05d8675
commit
db746f35e0
9 changed files with 3663 additions and 2810 deletions
107
utils.py
107
utils.py
|
|
@ -10,15 +10,19 @@ import copy
|
|||
import asyncio
|
||||
import pymupdf
|
||||
from io import BytesIO
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
import logging
|
||||
|
||||
CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
|
||||
|
||||
|
||||
def count_tokens(text, model):
|
||||
enc = tiktoken.encoding_for_model(model)
|
||||
tokens = enc.encode(text)
|
||||
return len(tokens)
|
||||
|
||||
def ChatGPT_API_with_finish_reason(model, prompt, api_key, chat_history=None):
|
||||
def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
|
||||
max_retries = 10
|
||||
client = openai.OpenAI(api_key=api_key)
|
||||
for i in range(max_retries):
|
||||
|
|
@ -50,7 +54,7 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key, chat_history=None):
|
|||
|
||||
|
||||
|
||||
def ChatGPT_API(model, prompt, api_key, chat_history=None):
|
||||
def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
|
||||
max_retries = 10
|
||||
client = openai.OpenAI(api_key=api_key)
|
||||
for i in range(max_retries):
|
||||
|
|
@ -78,7 +82,7 @@ def ChatGPT_API(model, prompt, api_key, chat_history=None):
|
|||
return "Error"
|
||||
|
||||
|
||||
async def ChatGPT_API_async(model, prompt, api_key):
|
||||
async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
|
||||
max_retries = 10
|
||||
client = openai.AsyncOpenAI(api_key=api_key)
|
||||
for i in range(max_retries):
|
||||
|
|
@ -151,7 +155,7 @@ def write_node_id(data, node_id=0):
|
|||
data['node_id'] = str(node_id).zfill(4)
|
||||
node_id += 1
|
||||
for key in list(data.keys()):
|
||||
if 'child_nodes' in key:
|
||||
if 'nodes' in key:
|
||||
node_id = write_node_id(data[key], node_id)
|
||||
elif isinstance(data, list):
|
||||
for index in range(len(data)):
|
||||
|
|
@ -161,10 +165,10 @@ def write_node_id(data, node_id=0):
|
|||
def get_nodes(structure):
|
||||
if isinstance(structure, dict):
|
||||
structure_node = copy.deepcopy(structure)
|
||||
structure_node.pop('child_nodes', None)
|
||||
structure_node.pop('nodes', None)
|
||||
nodes = [structure_node]
|
||||
for key in list(structure.keys()):
|
||||
if 'child_nodes' in key:
|
||||
if 'nodes' in key:
|
||||
nodes.extend(get_nodes(structure[key]))
|
||||
return nodes
|
||||
elif isinstance(structure, list):
|
||||
|
|
@ -177,8 +181,8 @@ def structure_to_list(structure):
|
|||
if isinstance(structure, dict):
|
||||
nodes = []
|
||||
nodes.append(structure)
|
||||
if 'child_nodes' in structure:
|
||||
nodes.extend(structure_to_list(structure['child_nodes']))
|
||||
if 'nodes' in structure:
|
||||
nodes.extend(structure_to_list(structure['nodes']))
|
||||
return nodes
|
||||
elif isinstance(structure, list):
|
||||
nodes = []
|
||||
|
|
@ -189,14 +193,14 @@ def structure_to_list(structure):
|
|||
|
||||
def get_leaf_nodes(structure):
|
||||
if isinstance(structure, dict):
|
||||
if not structure['child_nodes']:
|
||||
if not structure['nodes']:
|
||||
structure_node = copy.deepcopy(structure)
|
||||
structure_node.pop('child_nodes', None)
|
||||
structure_node.pop('nodes', None)
|
||||
return [structure_node]
|
||||
else:
|
||||
leaf_nodes = []
|
||||
for key in list(structure.keys()):
|
||||
if 'child_nodes' in key:
|
||||
if 'nodes' in key:
|
||||
leaf_nodes.extend(get_leaf_nodes(structure[key]))
|
||||
return leaf_nodes
|
||||
elif isinstance(structure, list):
|
||||
|
|
@ -212,7 +216,7 @@ def is_leaf_node(data, node_id):
|
|||
if data.get('node_id') == node_id:
|
||||
return data
|
||||
for key in data.keys():
|
||||
if 'child_nodes' in key:
|
||||
if 'nodes' in key:
|
||||
result = find_node(data[key], node_id)
|
||||
if result:
|
||||
return result
|
||||
|
|
@ -227,7 +231,7 @@ def is_leaf_node(data, node_id):
|
|||
node = find_node(data, node_id)
|
||||
|
||||
# Check if the node is a leaf node
|
||||
if node and not node.get('child_nodes'):
|
||||
if node and not node.get('nodes'):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
|
@ -353,7 +357,7 @@ def list_to_tree(data):
|
|||
'title': item.get('title'),
|
||||
'start_index': item.get('start_index'),
|
||||
'end_index': item.get('end_index'),
|
||||
'child_nodes': []
|
||||
'nodes': []
|
||||
}
|
||||
|
||||
nodes[structure] = node
|
||||
|
|
@ -364,7 +368,7 @@ def list_to_tree(data):
|
|||
if parent_structure:
|
||||
# Add as child to parent if parent exists
|
||||
if parent_structure in nodes:
|
||||
nodes[parent_structure]['child_nodes'].append(node)
|
||||
nodes[parent_structure]['nodes'].append(node)
|
||||
else:
|
||||
root_nodes.append(node)
|
||||
else:
|
||||
|
|
@ -373,10 +377,10 @@ def list_to_tree(data):
|
|||
|
||||
# Helper function to clean empty children arrays
|
||||
def clean_node(node):
|
||||
if not node['child_nodes']:
|
||||
del node['child_nodes']
|
||||
if not node['nodes']:
|
||||
del node['nodes']
|
||||
else:
|
||||
for child in node['child_nodes']:
|
||||
for child in node['nodes']:
|
||||
clean_node(child)
|
||||
return node
|
||||
|
||||
|
|
@ -424,7 +428,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
|
|||
def get_text_of_pdf_pages(pdf_pages, start_page, end_page):
|
||||
text = ""
|
||||
for page_num in range(start_page-1, end_page):
|
||||
text += pdf_pages[page_num]
|
||||
text += pdf_pages[page_num][0]
|
||||
return text
|
||||
|
||||
def get_number_of_pages(pdf_path):
|
||||
|
|
@ -460,8 +464,8 @@ def clean_structure_post(data):
|
|||
data.pop('page_number', None)
|
||||
data.pop('start_index', None)
|
||||
data.pop('end_index', None)
|
||||
if 'child_nodes' in data:
|
||||
clean_structure_post(data['child_nodes'])
|
||||
if 'nodes' in data:
|
||||
clean_structure_post(data['nodes'])
|
||||
elif isinstance(data, list):
|
||||
for section in data:
|
||||
clean_structure_post(section)
|
||||
|
|
@ -471,8 +475,8 @@ def clean_structure_post(data):
|
|||
def remove_structure_text(data):
|
||||
if isinstance(data, dict):
|
||||
data.pop('text', None)
|
||||
if 'child_nodes' in data:
|
||||
remove_structure_text(data['child_nodes'])
|
||||
if 'nodes' in data:
|
||||
remove_structure_text(data['nodes'])
|
||||
elif isinstance(data, list):
|
||||
for item in data:
|
||||
remove_structure_text(item)
|
||||
|
|
@ -522,3 +526,60 @@ def convert_page_to_int(data):
|
|||
# Keep original value if conversion fails
|
||||
pass
|
||||
return data
|
||||
|
||||
def write_node_id(data, node_id=0):
    """Assign sequential, zero-padded ids to every dict in a nested structure.

    The structure is walked depth-first in pre-order. Each dict receives a
    ``'node_id'`` entry holding the current counter rendered as a string
    padded to four digits (``'0000'``, ``'0001'``, ...). The structure is
    modified in place.

    Args:
        data: A dict, a list, or any other value. Values that are neither
            dict nor list are left untouched.
        node_id: The id to give to the first dict encountered.

    Returns:
        The next unused id, i.e. ``node_id`` plus the number of dicts that
        were labelled.
    """
    if isinstance(data, dict):
        data['node_id'] = str(node_id).zfill(4)
        node_id += 1
        for key, value in data.items():
            # Substring match on purpose: any key whose name contains
            # 'nodes' is treated as a container of children.
            if 'nodes' in key:
                node_id = write_node_id(value, node_id)
    elif isinstance(data, list):
        for item in data:
            node_id = write_node_id(item, node_id)
    return node_id
|
||||
|
||||
|
||||
def add_node_text(node, pdf_pages):
    """Attach page text to every node of a nested structure, in place.

    For each dict, the values stored under ``'start_index'`` and
    ``'end_index'`` are handed to ``get_text_of_pdf_pages`` together with
    ``pdf_pages``, and the result is stored under ``'text'``. Children found
    under the ``'nodes'`` key are processed recursively. Lists are processed
    element by element; any other value is ignored.

    Args:
        node: A dict, a list of nodes, or any other value.
        pdf_pages: Page data forwarded unchanged to ``get_text_of_pdf_pages``.

    Returns:
        None. The structure is modified in place.
    """
    if isinstance(node, dict):
        start_page = node.get('start_index')
        end_page = node.get('end_index')
        # NOTE(review): a node lacking either index passes None through to
        # get_text_of_pdf_pages; confirm callers always set both.
        node['text'] = get_text_of_pdf_pages(pdf_pages, start_page, end_page)
        if 'nodes' in node:
            add_node_text(node['nodes'], pdf_pages)
    elif isinstance(node, list):
        for child in node:
            add_node_text(child, pdf_pages)
|
||||
|
||||
async def generate_node_summary(node, model=None):
    """Ask the chat model to describe the main points of one node's text.

    Args:
        node: A mapping whose ``'text'`` entry holds the partial document
            text; a missing ``'text'`` key raises ``KeyError``.
        model: Model identifier forwarded to ``ChatGPT_API_async``.

    Returns:
        Whatever ``ChatGPT_API_async`` returns for the prompt.
    """
    prompt = f"""You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document.

    Partial Document Text: {node['text']}

    Directly return the description, do not include any other text.
    """
    return await ChatGPT_API_async(model, prompt)
|
||||
|
||||
|
||||
async def generate_summaries_for_structure(structure, model=None):
    """Generate a summary for every node of a structure concurrently.

    The structure is flattened with ``structure_to_list``, one
    ``generate_node_summary`` coroutine is created per node, and all of them
    are awaited together. Each result is stored on its node under
    ``'summary'``.

    Args:
        structure: The nested document structure to annotate.
        model: Model identifier forwarded to ``generate_node_summary``.

    Returns:
        The same ``structure`` object that was passed in.
    """
    flat_nodes = structure_to_list(structure)
    pending = [generate_node_summary(entry, model=model) for entry in flat_nodes]
    # gather() returns results in the order the awaitables were passed, so
    # they line up with flat_nodes.
    results = await asyncio.gather(*pending)

    # presumably structure_to_list yields the node dicts themselves rather
    # than copies, so these writes land in `structure` — verify.
    for entry, text in zip(flat_nodes, results):
        entry['summary'] = text
    return structure
|
||||
|
||||
|
||||
def generate_doc_description(structure, model=None):
    """Ask the chat model for a one-sentence description of a document.

    The structure is interpolated into the prompt via its string form.

    Args:
        structure: The document structure to describe.
        model: Model identifier forwarded to ``ChatGPT_API``.

    Returns:
        Whatever ``ChatGPT_API`` returns for the prompt.
    """
    prompt = f"""Your are an expert in generating descriptions for a document.
    You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.

    Document Structure: {structure}

    Directly return the description, do not include any other text.
    """
    return ChatGPT_API(model, prompt)
|
||||
Loading…
Add table
Add a link
Reference in a new issue