From b3836b5004beba586a012ba7a3609eef88a4108f Mon Sep 17 00:00:00 2001
From: Ray
Date: Sun, 31 Aug 2025 17:10:06 +0800
Subject: [PATCH] fix notebook
---
cookbook/pageindex_RAG_simple.ipynb | 98 ++++++++++++-----------------
1 file changed, 39 insertions(+), 59 deletions(-)
diff --git a/cookbook/pageindex_RAG_simple.ipynb b/cookbook/pageindex_RAG_simple.ipynb
index dcb1971..3ccb3f4 100644
--- a/cookbook/pageindex_RAG_simple.ipynb
+++ b/cookbook/pageindex_RAG_simple.ipynb
@@ -21,14 +21,18 @@
 " 🌐 Homepage • \n",
 " 🖥️ Dashboard • \n",
 " 📚 API Docs • \n",
- " 📦 GitHub • \n",
+ " 📦 GitHub • \n",
 " 💬 Discord • \n",
 " ✉️ Contact \n",
"
\n",
"\n",
- "\n",
+ "
\n",
" \n",
- "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
" \n",
"
\n",
"\n",
@@ -70,11 +74,13 @@
"source": [
 "## 📖 Notebook Overview\n",
"\n",
- "This notebook demonstrates a simple example of **vectorless RAG** with PageIndex. You will learn how to:\n",
+ "This notebook demonstrates a simple, minimal example of **vectorless RAG** with PageIndex. You will learn how to:\n",
"- [x] Build a PageIndex tree structure of a document\n",
"- [x] Perform reasoning-based retrieval with tree search\n",
"- [x] Generate answers based on the retrieved context\n",
"\n",
+ "> ⚡ Note: This is a **minimal example** to illustrate PageIndex's core philosophy and idea, not its full capabilities. More advanced examples are coming soon.\n",
+ "\n",
"---"
]
},
@@ -94,7 +100,7 @@
"id": "edTfrizMFK4c"
},
"source": [
- "#### 0.1 Install dependencies"
+ "#### 0.1 Install PageIndex"
]
},
{
@@ -106,7 +112,7 @@
},
"outputs": [],
"source": [
- "%pip install -q --upgrade pageindex openai"
+ "%pip install -q --upgrade pageindex"
]
},
{
@@ -115,7 +121,7 @@
"id": "WVEWzPKGcG1M"
},
"source": [
- "#### 0.2 Setup environment"
+ "#### 0.2 Setup PageIndex"
]
},
{
@@ -126,34 +132,32 @@
},
"outputs": [],
"source": [
- "import os, json, openai, requests, textwrap\n",
"from pageindex import PageIndexClient\n",
- "from pprint import pprint\n",
+ "import pageindex.utils as utils\n",
"\n",
"# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n",
"PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n",
- "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n",
- "\n",
"pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)"
]
},
{
"cell_type": "markdown",
- "metadata": {
- "id": "AR7PLeVbcG1N"
- },
+ "metadata": {},
"source": [
- "#### 0.3 Define utility functions"
+ "#### 0.3 Setup LLM\n",
+ "\n",
+ "Choose your preferred LLM for reasoning-based retrieval. In this example, we use OpenAI's GPT-4.1."
]
},
{
"cell_type": "code",
- "execution_count": 63,
- "metadata": {
- "id": "hmj3POkDcG1N"
- },
+ "execution_count": null,
+ "metadata": {},
"outputs": [],
"source": [
+ "import openai\n",
+ "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n",
+ "\n",
"async def call_llm(prompt, model=\"gpt-4.1\", temperature=0):\n",
" client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n",
" response = await client.chat.completions.create(\n",
@@ -161,35 +165,7 @@
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
" temperature=temperature\n",
" )\n",
- " return response.choices[0].message.content.strip()\n",
- "\n",
- "def remove_fields(data, fields=['text'], max_len=40):\n",
- " if isinstance(data, dict):\n",
- " return {k: remove_fields(v, fields)\n",
- " for k, v in data.items() if k not in fields}\n",
- " elif isinstance(data, list):\n",
- " return [remove_fields(item, fields) for item in data]\n",
- " elif isinstance(data, str):\n",
- " return (data[:max_len] + '...') if len(data) > max_len else data\n",
- " return data\n",
- "\n",
- "def print_tree(tree, exclude_fields=['text', 'page_index']):\n",
- " cleaned_tree = remove_fields(tree.copy(), exclude_fields)\n",
- " pprint(cleaned_tree, sort_dicts=False, width=100)\n",
- "\n",
- "def show(text, width=100):\n",
- " for line in text.splitlines():\n",
- " print(textwrap.fill(line, width=width))\n",
- "\n",
- "def create_node_mapping(tree):\n",
- " \"\"\"Create a mapping of node_id to node for quick lookup\"\"\"\n",
- " def get_all_nodes(tree):\n",
- " if isinstance(tree, dict):\n",
- " return [tree] + [node for child in tree.get('nodes', []) for node in get_all_nodes(child)]\n",
- " elif isinstance(tree, list):\n",
- " return [node for item in tree for node in get_all_nodes(item)]\n",
- " return []\n",
- " return {node[\"node_id\"]: node for node in get_all_nodes(tree) if node.get(\"node_id\")}"
+ " return response.choices[0].message.content.strip()"
]
},
{
@@ -226,11 +202,13 @@
"output_type": "stream",
"text": [
"Downloaded https://arxiv.org/pdf/2501.12948.pdf\n",
- "Document Submitted: pi-cmek7luf400960ao3o0o8us4d\n"
+ "Document Submitted: pi-cmeseq08w00vt0bo3u6tr244g\n"
]
}
],
"source": [
+ "import os, requests\n",
+ "\n",
"# You can also use our GitHub repo to generate PageIndex tree\n",
"# https://github.com/VectifyAI/PageIndex\n",
"\n",
@@ -302,7 +280,7 @@
" 'prefix_summary': '### 2.2. DeepSeek-R1-Zero: Reinforcement...',\n",
" 'nodes': [{'title': '2.2.1. Reinforcement Learning Algorithm',\n",
" 'node_id': '0009',\n",
- " 'summary': 'This partial document describes the Grou...'},\n",
+ " 'summary': 'The partial document describes the Group...'},\n",
" {'title': '2.2.2. Reward Modeling',\n",
" 'node_id': '0010',\n",
" 'summary': 'This partial document discusses the rewa...'},\n",
@@ -335,7 +313,7 @@
" 'summary': 'This partial document presents the concl...'},\n",
" {'title': 'References',\n",
" 'node_id': '0020',\n",
- " 'summary': 'The partial document consists of a compr...'},\n",
+ " 'summary': 'This partial document consists of the re...'},\n",
" {'title': 'Appendix', 'node_id': '0021', 'summary': '## Appendix\\n'},\n",
" {'title': 'A. Contributions and Acknowledgments',\n",
" 'node_id': '0022',\n",
@@ -347,7 +325,7 @@
"if pi_client.is_retrieval_ready(doc_id):\n",
" tree = pi_client.get_tree(doc_id, node_summary=True)['result']\n",
" print('Simplified Tree Structure of the Document:')\n",
- " print_tree(tree)\n",
+ " utils.print_tree(tree)\n",
"else:\n",
" print(\"Processing document, please try again later...\")"
]
@@ -376,9 +354,11 @@
},
"outputs": [],
"source": [
+ "import json\n",
+ "\n",
"query = \"What are the conclusions in this document?\"\n",
"\n",
- "tree_without_text = remove_fields(tree.copy(), fields=['text'])\n",
+ "tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])\n",
"\n",
"search_prompt = f\"\"\"\n",
"You are given a question and a tree structure of a document.\n",
@@ -439,11 +419,11 @@
}
],
"source": [
- "node_map = create_node_mapping(tree)\n",
+ "node_map = utils.create_node_mapping(tree)\n",
"tree_search_result_json = json.loads(tree_search_result)\n",
"\n",
"print('Reasoning Process:')\n",
- "show(tree_search_result_json['thinking'])\n",
+ "utils.print_wrapped(tree_search_result_json['thinking'])\n",
"\n",
"print('\\nRetrieved Nodes:')\n",
"for node_id in tree_search_result_json[\"node_list\"]:\n",
@@ -509,7 +489,7 @@
"relevant_content = \"\\n\\n\".join(node_map[node_id][\"text\"] for node_id in node_list)\n",
"\n",
"print('Retrieved Context:\\n')\n",
- "show(relevant_content[:1000] + '...')"
+ "utils.print_wrapped(relevant_content[:1000] + '...')"
]
},
{
@@ -564,7 +544,7 @@
"\n",
"print('Generated Answer:\\n')\n",
"answer = await call_llm(answer_prompt)\n",
- "show(answer)"
+ "utils.print_wrapped(answer)"
]
},
{
@@ -577,7 +557,7 @@
"\n",
 "## 🎯 What's Next\n",
"\n",
- "This notebook has demonstrated a basic, minimal example of **reasoning-based**, **vectorless** RAG with PageIndex. The workflow illustrates the core idea:\n",
+ "This notebook has demonstrated a **basic**, **minimal** example of **reasoning-based**, **vectorless** RAG with PageIndex. The workflow illustrates the core idea:\n",
"> *Generating a hierarchical tree structure from a document, reasoning over that tree structure, and extracting relevant context, without relying on a vector database or top-k similarity search*.\n",
"\n",
"While this notebook highlights a minimal workflow, the PageIndex framework is built to support **far more advanced** use cases. In upcoming tutorials, we will introduce:\n",
@@ -596,7 +576,7 @@
 " 🌐 Homepage • \n",
 " 🖥️ Dashboard • \n",
 " 📚 API Docs • \n",
- " 📦 GitHub • \n",
+ " 📦 GitHub • \n",
 " 💬 Discord • \n",
 " ✉️ Contact\n",
"\n",