mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-07 13:52:37 +02:00
fix notebook
This commit is contained in:
parent
3c770d833f
commit
b3836b5004
1 changed file with 39 additions and 59 deletions
|
|
@@ -21,14 +21,18 @@
|
|||
" <a href=\"https://vectify.ai\">🏠 Homepage</a> • \n",
|
||||
" <a href=\"https://dash.pageindex.ai\">🖥️ Dashboard</a> • \n",
|
||||
" <a href=\"https://docs.pageindex.ai/quickstart\">📚 API Docs</a> • \n",
|
||||
" <a href=\"https://github.com/vectifyai/pageindex\">📦 GitHub</a> • \n",
|
||||
" <a href=\"https://github.com/VectifyAI/PageIndex\">📦 GitHub</a> • \n",
|
||||
" <a href=\"https://discord.com/invite/VuXuf29EUj\">💬 Discord</a> • \n",
|
||||
" <a href=\"https://ii2abc2jejf.typeform.com/to/tK3AXl8T\">✉️ Contact</a> \n",
|
||||
"</p>\n",
|
||||
"\n",
|
||||
"<p align=\"center\">\n",
|
||||
"<p align=\"center\" style=\"white-space: nowrap;\">\n",
|
||||
" <a href=\"https://github.com/VectifyAI/PageIndex/stargazers\">\n",
|
||||
" <img src=\"https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐%20Star%20Us\" alt=\"Star us on GitHub\" />\n",
|
||||
" <img height=\"30\" src=\"https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐%20Star%20Us\" alt=\"Star us on GitHub\" />\n",
|
||||
" </a>\n",
|
||||
" <span style=\"margin: 0 12px;\"></span>\n",
|
||||
" <a href=\"https://twitter.com/VectifyAI\">\n",
|
||||
" <img height=\"30\" src=\"https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white\" alt=\"Follow us on X\" />\n",
|
||||
" </a>\n",
|
||||
"</p>\n",
|
||||
"\n",
|
||||
|
|
@@ -70,11 +74,13 @@
|
|||
"source": [
|
||||
"## 📝 Notebook Overview\n",
|
||||
"\n",
|
||||
"This notebook demonstrates a simple example of **vectorless RAG** with PageIndex. You will learn how to:\n",
|
||||
"This notebook demonstrates a simple, minimal example of **vectorless RAG** with PageIndex. You will learn how to:\n",
|
||||
"- [x] Build a PageIndex tree structure of a document\n",
|
||||
"- [x] Perform reasoning-based retrieval with tree search\n",
|
||||
"- [x] Generate answers based on the retrieved context\n",
|
||||
"\n",
|
||||
"> ⚡ Note: This is a **minimal example** to illustrate PageIndex's core philosophy and idea, not its full capabilities. More advanced examples are coming soon.\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
|
|
@@ -94,7 +100,7 @@
|
|||
"id": "edTfrizMFK4c"
|
||||
},
|
||||
"source": [
|
||||
"#### 0.1 Install dependencies"
|
||||
"#### 0.1 Install PageIndex"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@@ -106,7 +112,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -q --upgrade pageindex openai"
|
||||
"%pip install -q --upgrade pageindex"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@@ -115,7 +121,7 @@
|
|||
"id": "WVEWzPKGcG1M"
|
||||
},
|
||||
"source": [
|
||||
"#### 0.2 Setup environment"
|
||||
"#### 0.2 Setup PageIndex"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@@ -126,34 +132,32 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os, json, openai, requests, textwrap\n",
|
||||
"from pageindex import PageIndexClient\n",
|
||||
"from pprint import pprint\n",
|
||||
"import pageindex.utils as utils\n",
|
||||
"\n",
|
||||
"# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n",
|
||||
"PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n",
|
||||
"OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n",
|
||||
"\n",
|
||||
"pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "AR7PLeVbcG1N"
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### 0.3 Define utility functions"
|
||||
"#### 0.3 Setup LLM\n",
|
||||
"\n",
|
||||
"Choose your preferred LLM for reasoning-based retrieval. In this example, we use OpenAI’s GPT-4.1."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 63,
|
||||
"metadata": {
|
||||
"id": "hmj3POkDcG1N"
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import openai\n",
|
||||
"OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n",
|
||||
"\n",
|
||||
"async def call_llm(prompt, model=\"gpt-4.1\", temperature=0):\n",
|
||||
" client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n",
|
||||
" response = await client.chat.completions.create(\n",
|
||||
|
|
@@ -161,35 +165,7 @@
|
|||
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
|
||||
" temperature=temperature\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content.strip()\n",
|
||||
"\n",
|
||||
"def remove_fields(data, fields=['text'], max_len=40):\n",
|
||||
" if isinstance(data, dict):\n",
|
||||
" return {k: remove_fields(v, fields)\n",
|
||||
" for k, v in data.items() if k not in fields}\n",
|
||||
" elif isinstance(data, list):\n",
|
||||
" return [remove_fields(item, fields) for item in data]\n",
|
||||
" elif isinstance(data, str):\n",
|
||||
" return (data[:max_len] + '...') if len(data) > max_len else data\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"def print_tree(tree, exclude_fields=['text', 'page_index']):\n",
|
||||
" cleaned_tree = remove_fields(tree.copy(), exclude_fields)\n",
|
||||
" pprint(cleaned_tree, sort_dicts=False, width=100)\n",
|
||||
"\n",
|
||||
"def show(text, width=100):\n",
|
||||
" for line in text.splitlines():\n",
|
||||
" print(textwrap.fill(line, width=width))\n",
|
||||
"\n",
|
||||
"def create_node_mapping(tree):\n",
|
||||
" \"\"\"Create a mapping of node_id to node for quick lookup\"\"\"\n",
|
||||
" def get_all_nodes(tree):\n",
|
||||
" if isinstance(tree, dict):\n",
|
||||
" return [tree] + [node for child in tree.get('nodes', []) for node in get_all_nodes(child)]\n",
|
||||
" elif isinstance(tree, list):\n",
|
||||
" return [node for item in tree for node in get_all_nodes(item)]\n",
|
||||
" return []\n",
|
||||
" return {node[\"node_id\"]: node for node in get_all_nodes(tree) if node.get(\"node_id\")}"
|
||||
" return response.choices[0].message.content.strip()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@@ -226,11 +202,13 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloaded https://arxiv.org/pdf/2501.12948.pdf\n",
|
||||
"Document Submitted: pi-cmek7luf400960ao3o0o8us4d\n"
|
||||
"Document Submitted: pi-cmeseq08w00vt0bo3u6tr244g\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os, requests\n",
|
||||
"\n",
|
||||
"# You can also use our GitHub repo to generate PageIndex tree\n",
|
||||
"# https://github.com/VectifyAI/PageIndex\n",
|
||||
"\n",
|
||||
|
|
@@ -302,7 +280,7 @@
|
|||
" 'prefix_summary': '### 2.2. DeepSeek-R1-Zero: Reinforcement...',\n",
|
||||
" 'nodes': [{'title': '2.2.1. Reinforcement Learning Algorithm',\n",
|
||||
" 'node_id': '0009',\n",
|
||||
" 'summary': 'This partial document describes the Grou...'},\n",
|
||||
" 'summary': 'The partial document describes the Group...'},\n",
|
||||
" {'title': '2.2.2. Reward Modeling',\n",
|
||||
" 'node_id': '0010',\n",
|
||||
" 'summary': 'This partial document discusses the rewa...'},\n",
|
||||
|
|
@@ -335,7 +313,7 @@
|
|||
" 'summary': 'This partial document presents the concl...'},\n",
|
||||
" {'title': 'References',\n",
|
||||
" 'node_id': '0020',\n",
|
||||
" 'summary': 'The partial document consists of a compr...'},\n",
|
||||
" 'summary': 'This partial document consists of the re...'},\n",
|
||||
" {'title': 'Appendix', 'node_id': '0021', 'summary': '## Appendix\\n'},\n",
|
||||
" {'title': 'A. Contributions and Acknowledgments',\n",
|
||||
" 'node_id': '0022',\n",
|
||||
|
|
@@ -347,7 +325,7 @@
|
|||
"if pi_client.is_retrieval_ready(doc_id):\n",
|
||||
" tree = pi_client.get_tree(doc_id, node_summary=True)['result']\n",
|
||||
" print('Simplified Tree Structure of the Document:')\n",
|
||||
" print_tree(tree)\n",
|
||||
" utils.print_tree(tree)\n",
|
||||
"else:\n",
|
||||
" print(\"Processing document, please try again later...\")"
|
||||
]
|
||||
|
|
@@ -376,9 +354,11 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"query = \"What are the conclusions in this document?\"\n",
|
||||
"\n",
|
||||
"tree_without_text = remove_fields(tree.copy(), fields=['text'])\n",
|
||||
"tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])\n",
|
||||
"\n",
|
||||
"search_prompt = f\"\"\"\n",
|
||||
"You are given a question and a tree structure of a document.\n",
|
||||
|
|
@@ -439,11 +419,11 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"node_map = create_node_mapping(tree)\n",
|
||||
"node_map = utils.create_node_mapping(tree)\n",
|
||||
"tree_search_result_json = json.loads(tree_search_result)\n",
|
||||
"\n",
|
||||
"print('Reasoning Process:')\n",
|
||||
"show(tree_search_result_json['thinking'])\n",
|
||||
"utils.print_wrapped(tree_search_result_json['thinking'])\n",
|
||||
"\n",
|
||||
"print('\\nRetrieved Nodes:')\n",
|
||||
"for node_id in tree_search_result_json[\"node_list\"]:\n",
|
||||
|
|
@@ -509,7 +489,7 @@
|
|||
"relevant_content = \"\\n\\n\".join(node_map[node_id][\"text\"] for node_id in node_list)\n",
|
||||
"\n",
|
||||
"print('Retrieved Context:\\n')\n",
|
||||
"show(relevant_content[:1000] + '...')"
|
||||
"utils.print_wrapped(relevant_content[:1000] + '...')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@@ -564,7 +544,7 @@
|
|||
"\n",
|
||||
"print('Generated Answer:\\n')\n",
|
||||
"answer = await call_llm(answer_prompt)\n",
|
||||
"show(answer)"
|
||||
"utils.print_wrapped(answer)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@@ -577,7 +557,7 @@
|
|||
"\n",
|
||||
"## 🎯 What's Next\n",
|
||||
"\n",
|
||||
"This notebook has demonstrated a basic, minimal example of **reasoning-based**, **vectorless** RAG with PageIndex. The workflow illustrates the core idea:\n",
|
||||
"This notebook has demonstrated a **basic**, **minimal** example of **reasoning-based**, **vectorless** RAG with PageIndex. The workflow illustrates the core idea:\n",
|
||||
"> *Generating a hierarchical tree structure from a document, reasoning over that tree structure, and extracting relevant context, without relying on a vector database or top-k similarity search*.\n",
|
||||
"\n",
|
||||
"While this notebook highlights a minimal workflow, the PageIndex framework is built to support **far more advanced** use cases. In upcoming tutorials, we will introduce:\n",
|
||||
|
|
@@ -596,7 +576,7 @@
|
|||
" <a href=\"https://vectify.ai\">🏠 Homepage</a> • \n",
|
||||
" <a href=\"https://dash.pageindex.ai\">🖥️ Dashboard</a> • \n",
|
||||
" <a href=\"https://docs.pageindex.ai/quickstart\">📚 API Docs</a> • \n",
|
||||
" <a href=\"https://github.com/vectifyai/pageindex\">📦 GitHub</a> • \n",
|
||||
" <a href=\"https://github.com/VectifyAI/PageIndex\">📦 GitHub</a> • \n",
|
||||
" <a href=\"https://discord.com/invite/VuXuf29EUj\">💬 Discord</a> • \n",
|
||||
" <a href=\"https://ii2abc2jejf.typeform.com/to/tK3AXl8T\">✉️ Contact</a>\n",
|
||||
"\n",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue