diff --git a/cookbook/pageindex_RAG_simple.ipynb b/cookbook/pageindex_RAG_simple.ipynb new file mode 100644 index 0000000..9c40bed --- /dev/null +++ b/cookbook/pageindex_RAG_simple.ipynb @@ -0,0 +1,623 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "TCh9BTedHJK1" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nD0hb4TFHWTt" + }, + "source": [ + "
Reasoning-based RAG ✧ No Vector DB ✧ No Chunking ✧ Human-like Retrieval
\n", + "\n", + "\n", + " 🏠 Homepage • \n", + " 🖥️ Dashboard • \n", + " 📚 API Docs • \n", + " 📦 GitHub • \n", + " 💬 Discord • \n", + " ✉️ Contact \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ebvn5qfpcG1K" + }, + "source": [ + "# 🧠 Simple Vectorless RAG with PageIndex\n", + "\n", + "PageIndex generates a searchable tree structure of documents, enabling reasoning-based retrieval through tree search — without vectors.\n", + "\n", + "- **No Vectors Needed**: Uses document structure and LLM reasoning for retrieval.\n", + "- **No Chunking Needed**: Documents are organized into natural sections rather than artificial chunks.\n", + "- **No Top-K Needed**: The LLM decides how many nodes need to be retrieved.\n", + "- **Transparent Retrieval Process**: Retrieval based on reasoning — say goodbye to approximate semantic search ('vibe retrieval').\n", + "\n", + "# 📝 About this Notebook\n", + "This notebook demonstrates a simple example of **vectorless RAG** with PageIndex. You will learn:\n", + "- [x] How to generate PageIndex tree structure of a document.\n", + "- [x] How to perform retrieval with tree search.\n", + "- [x] How to generate the answer based on the retrieved context." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7ziuTbbWcG1L" + }, + "source": [ + "# Preparation\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "edTfrizMFK4c" + }, + "source": [ + "## Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "LaoB58wQFNDh" + }, + "outputs": [], + "source": [ + "%pip install -q --upgrade pageindex openai" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WVEWzPKGcG1M" + }, + "source": [ + "## Setup Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "StvqfcK4cG1M" + }, + "outputs": [], + "source": [ + "import os, json, openai, requests\n", + "from pageindex import PageIndexClient\n", + "from pprint import pprint\n", + "from IPython.display import Markdown, display\n", + "\n", + "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\" # Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n", + "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n", + "\n", + "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AR7PLeVbcG1N" + }, + "source": [ + "## Define Utility Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": { + "id": "hmj3POkDcG1N" + }, + "outputs": [], + "source": [ + "async def call_llm(prompt, model=\"gpt-4.1\", temperature=0):\n", + " client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n", + " response = await client.chat.completions.create(\n", + " model=model,\n", + " messages=[{\"role\": \"user\", \"content\": prompt}],\n", + " temperature=temperature\n", + " )\n", + " return response.choices[0].message.content.strip()\n", + "\n", + "def remove_fields(data, fields=['text'], max_len=50):\n", + " if isinstance(data, dict):\n", + " return {k: remove_fields(v, fields)\n", + " for k, v in data.items() if k not in fields}\n", + " elif isinstance(data, list):\n", + " return [remove_fields(item, fields) for item in data]\n", + " elif isinstance(data, str):\n", + " return (data[:max_len] + '...') if len(data) > max_len else data\n", + " return data\n", + "\n", + "def print_tree(tree, exclude_fields=['text', 'page_index']):\n", + " cleaned_tree = remove_fields(tree.copy(), exclude_fields)\n", + " pprint(cleaned_tree, sort_dicts=False, width=150)\n", + "\n", + "def print_markdown(*lines):\n", + " text = \"\\n\".join(lines)\n", + " display(Markdown(text))\n", + "\n", + "def create_node_mapping(tree):\n", + " \"\"\"Create a mapping of node_id to node for quick lookup\"\"\"\n", + " def get_all_nodes(tree):\n", + " if isinstance(tree, dict):\n", + " return [tree] + [node for child in tree.get('nodes', []) for node in get_all_nodes(child)]\n", + " elif isinstance(tree, list):\n", + " return [node for item in tree for node in get_all_nodes(item)]\n", + " return []\n", + " return {node[\"node_id\"]: node for node in get_all_nodes(tree) if node.get(\"node_id\")}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "heGtIMOVcG1N" + }, + "source": [ + "# Step 1: PageIndex Tree Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mzd1VWjwMUJL" + }, + "source": [ + "## Submit a document with PageIndex SDK" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f6--eZPLcG1N", + "outputId": "ca688cfd-6c4b-4a57-dac2-f3c2604c4112" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloaded https://arxiv.org/pdf/2501.12948.pdf\n", + "Document Submitted: pi-cmek7luf400960ao3o0o8us4d\n" + ] + } + ], + "source": [ + "# You can also use our GitHub repo to generate PageIndex structure\n", + "# https://github.com/VectifyAI/PageIndex\n", + "\n", + "pdf_url = \"https://arxiv.org/pdf/2501.12948.pdf\"\n", + "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n", + "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n", + "\n", + "response = requests.get(pdf_url)\n", + "with open(pdf_path, \"wb\") as f:\n", + " f.write(response.content)\n", + "print(f\"Downloaded {pdf_url}\")\n", + "\n", + "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n", + "print('Document Submitted:', doc_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4-Hrh0azcG1N" + }, + "source": [ + "## Get the generated PageIndex tree structure" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "b1Q1g6vrcG1O", + "outputId": "dc944660-38ad-47ea-d358-be422edbae53" + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Simplified Tree Structure of the Document\n", + "---" + ], + "text/plain": [ + "