diff --git a/cookbook/pageindex_RAG_simple.ipynb b/cookbook/pageindex_RAG_simple.ipynb index dcb1971..3ccb3f4 100644 --- a/cookbook/pageindex_RAG_simple.ipynb +++ b/cookbook/pageindex_RAG_simple.ipynb @@ -21,14 +21,18 @@ " 🏠 Homepage  •  \n", " 🖥️ Dashboard  •  \n", " 📚 API Docs  •  \n", - " 📦 GitHub  •  \n", + " 📦 GitHub  •  \n", " 💬 Discord  •  \n", " ✉️ Contact \n", "

\n", "\n", - "

\n", + "

\n", " \n", - " \"Star\n", + " \"Star\n", + " \n", + " \n", + " \n", + " \"Follow\n", " \n", "

\n", "\n", @@ -70,11 +74,13 @@ "source": [ "## 📝 Notebook Overview\n", "\n", - "This notebook demonstrates a simple example of **vectorless RAG** with PageIndex. You will learn how to:\n", + "This notebook demonstrates a simple, minimal example of **vectorless RAG** with PageIndex. You will learn how to:\n", "- [x] Build a PageIndex tree structure of a document\n", "- [x] Perform reasoning-based retrieval with tree search\n", "- [x] Generate answers based on the retrieved context\n", "\n", + "> ⚡ Note: This is a **minimal example** to illustrate PageIndex's core philosophy and idea, not its full capabilities. More advanced examples are coming soon.\n", + "\n", "---" ] }, @@ -94,7 +100,7 @@ "id": "edTfrizMFK4c" }, "source": [ - "#### 0.1 Install dependencies" + "#### 0.1 Install PageIndex" ] }, { @@ -106,7 +112,7 @@ }, "outputs": [], "source": [ - "%pip install -q --upgrade pageindex openai" + "%pip install -q --upgrade pageindex" ] }, { @@ -115,7 +121,7 @@ "id": "WVEWzPKGcG1M" }, "source": [ - "#### 0.2 Setup environment" + "#### 0.2 Setup PageIndex" ] }, { @@ -126,34 +132,32 @@ }, "outputs": [], "source": [ - "import os, json, openai, requests, textwrap\n", "from pageindex import PageIndexClient\n", - "from pprint import pprint\n", + "import pageindex.utils as utils\n", "\n", "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n", - "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n", - "\n", "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "AR7PLeVbcG1N" - }, + "metadata": {}, "source": [ - "#### 0.3 Define utility functions" + "#### 0.3 Setup LLM\n", + "\n", + "Choose your preferred LLM for reasoning-based retrieval. In this example, we use OpenAI’s GPT-4.1."
] }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "id": "hmj3POkDcG1N" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ + "import openai\n", + "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n", + "\n", "async def call_llm(prompt, model=\"gpt-4.1\", temperature=0):\n", " client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n", " response = await client.chat.completions.create(\n", @@ -161,35 +165,7 @@ " messages=[{\"role\": \"user\", \"content\": prompt}],\n", " temperature=temperature\n", " )\n", - " return response.choices[0].message.content.strip()\n", - "\n", - "def remove_fields(data, fields=['text'], max_len=40):\n", - " if isinstance(data, dict):\n", - " return {k: remove_fields(v, fields)\n", - " for k, v in data.items() if k not in fields}\n", - " elif isinstance(data, list):\n", - " return [remove_fields(item, fields) for item in data]\n", - " elif isinstance(data, str):\n", - " return (data[:max_len] + '...') if len(data) > max_len else data\n", - " return data\n", - "\n", - "def print_tree(tree, exclude_fields=['text', 'page_index']):\n", - " cleaned_tree = remove_fields(tree.copy(), exclude_fields)\n", - " pprint(cleaned_tree, sort_dicts=False, width=100)\n", - "\n", - "def show(text, width=100):\n", - " for line in text.splitlines():\n", - " print(textwrap.fill(line, width=width))\n", - "\n", - "def create_node_mapping(tree):\n", - " \"\"\"Create a mapping of node_id to node for quick lookup\"\"\"\n", - " def get_all_nodes(tree):\n", - " if isinstance(tree, dict):\n", - " return [tree] + [node for child in tree.get('nodes', []) for node in get_all_nodes(child)]\n", - " elif isinstance(tree, list):\n", - " return [node for item in tree for node in get_all_nodes(item)]\n", - " return []\n", - " return {node[\"node_id\"]: node for node in get_all_nodes(tree) if node.get(\"node_id\")}" + " return response.choices[0].message.content.strip()" ] }, { @@ -226,11 +202,13 @@ "output_type": "stream", "text": [ 
"Downloaded https://arxiv.org/pdf/2501.12948.pdf\n", - "Document Submitted: pi-cmek7luf400960ao3o0o8us4d\n" + "Document Submitted: pi-cmeseq08w00vt0bo3u6tr244g\n" ] } ], "source": [ + "import os, requests\n", + "\n", "# You can also use our GitHub repo to generate PageIndex tree\n", "# https://github.com/VectifyAI/PageIndex\n", "\n", @@ -302,7 +280,7 @@ " 'prefix_summary': '### 2.2. DeepSeek-R1-Zero: Reinforcement...',\n", " 'nodes': [{'title': '2.2.1. Reinforcement Learning Algorithm',\n", " 'node_id': '0009',\n", - " 'summary': 'This partial document describes the Grou...'},\n", + " 'summary': 'The partial document describes the Group...'},\n", " {'title': '2.2.2. Reward Modeling',\n", " 'node_id': '0010',\n", " 'summary': 'This partial document discusses the rewa...'},\n", @@ -335,7 +313,7 @@ " 'summary': 'This partial document presents the concl...'},\n", " {'title': 'References',\n", " 'node_id': '0020',\n", - " 'summary': 'The partial document consists of a compr...'},\n", + " 'summary': 'This partial document consists of the re...'},\n", " {'title': 'Appendix', 'node_id': '0021', 'summary': '## Appendix\\n'},\n", " {'title': 'A. 
Contributions and Acknowledgments',\n", " 'node_id': '0022',\n", @@ -347,7 +325,7 @@ "if pi_client.is_retrieval_ready(doc_id):\n", " tree = pi_client.get_tree(doc_id, node_summary=True)['result']\n", " print('Simplified Tree Structure of the Document:')\n", - " print_tree(tree)\n", + " utils.print_tree(tree)\n", "else:\n", " print(\"Processing document, please try again later...\")" ] @@ -376,9 +354,11 @@ }, "outputs": [], "source": [ + "import json\n", + "\n", "query = \"What are the conclusions in this document?\"\n", "\n", - "tree_without_text = remove_fields(tree.copy(), fields=['text'])\n", + "tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])\n", "\n", "search_prompt = f\"\"\"\n", "You are given a question and a tree structure of a document.\n", @@ -439,11 +419,11 @@ } ], "source": [ - "node_map = create_node_mapping(tree)\n", + "node_map = utils.create_node_mapping(tree)\n", "tree_search_result_json = json.loads(tree_search_result)\n", "\n", "print('Reasoning Process:')\n", - "show(tree_search_result_json['thinking'])\n", + "utils.print_wrapped(tree_search_result_json['thinking'])\n", "\n", "print('\\nRetrieved Nodes:')\n", "for node_id in tree_search_result_json[\"node_list\"]:\n", @@ -509,7 +489,7 @@ "relevant_content = \"\\n\\n\".join(node_map[node_id][\"text\"] for node_id in node_list)\n", "\n", "print('Retrieved Context:\\n')\n", - "show(relevant_content[:1000] + '...')" + "utils.print_wrapped(relevant_content[:1000] + '...')" ] }, { @@ -564,7 +544,7 @@ "\n", "print('Generated Answer:\\n')\n", "answer = await call_llm(answer_prompt)\n", - "show(answer)" + "utils.print_wrapped(answer)" ] }, { @@ -577,7 +557,7 @@ "\n", "## 🎯 What's Next\n", "\n", - "This notebook has demonstrated a basic, minimal example of **reasoning-based**, **vectorless** RAG with PageIndex. The workflow illustrates the core idea:\n", + "This notebook has demonstrated a **basic**, **minimal** example of **reasoning-based**, **vectorless** RAG with PageIndex. 
The workflow illustrates the core idea:\n", "> *Generating a hierarchical tree structure from a document, reasoning over that tree structure, and extracting relevant context, without relying on a vector database or top-k similarity search*.\n", "\n", "While this notebook highlights a minimal workflow, the PageIndex framework is built to support **far more advanced** use cases. In upcoming tutorials, we will introduce:\n", @@ -596,7 +576,7 @@ " 🏠 Homepage  •  \n", " 🖥️ Dashboard  •  \n", " 📚 API Docs  •  \n", - " 📦 GitHub  •  \n", + " 📦 GitHub  •  \n", " 💬 Discord  •  \n", " ✉️ Contact\n", "\n",