mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-04 04:22:37 +02:00
fix notebook
This commit is contained in:
parent
9c48a61040
commit
03bb108aab
1 changed files with 162 additions and 164 deletions
|
|
@ -1,30 +1,19 @@
|
||||||
{
|
{
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 0,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"provenance": []
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"name": "python3",
|
|
||||||
"display_name": "Python 3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"name": "python"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
""
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "XTboY7brzyp2"
|
"id": "XTboY7brzyp2"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
""
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "EtjMbl9Pz3S-"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"<p align=\"center\">Reasoning-based RAG ◦ No Vector DB ◦ No Chunking ◦ Human-like Retrieval</p>\n",
|
"<p align=\"center\">Reasoning-based RAG ◦ No Vector DB ◦ No Chunking ◦ Human-like Retrieval</p>\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|
@ -44,13 +33,13 @@
|
||||||
"</div>\n",
|
"</div>\n",
|
||||||
"\n",
|
"\n",
|
||||||
"---\n"
|
"---\n"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "EtjMbl9Pz3S-"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "bbC9uLWCz8zl"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# Agentic Retrieval with PageIndex Chat API\n",
|
"# Agentic Retrieval with PageIndex Chat API\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|
@ -72,66 +61,81 @@
|
||||||
"This notebook demonstrates a simple, minimal example of agentic retrieval with PageIndex. You will learn:\n",
|
"This notebook demonstrates a simple, minimal example of agentic retrieval with PageIndex. You will learn:\n",
|
||||||
"- [x] How to use PageIndex Chat API.\n",
|
"- [x] How to use PageIndex Chat API.\n",
|
||||||
"- [x] How to prompt the PageIndex Chat to make it a retrieval system"
|
"- [x] How to prompt the PageIndex Chat to make it a retrieval system"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "bbC9uLWCz8zl"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"### Install PageIndex SDK"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "77SQbPoe-LTN"
|
"id": "77SQbPoe-LTN"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"### Install PageIndex SDK"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": 36,
|
||||||
"%pip install -q --upgrade pageindex"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "6Eiv_cHf0OXz"
|
"id": "6Eiv_cHf0OXz"
|
||||||
},
|
},
|
||||||
"execution_count": 36,
|
"outputs": [],
|
||||||
"outputs": []
|
"source": [
|
||||||
|
"%pip install -q --upgrade pageindex"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"### Setup PageIndex"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "UR9-qkdD-Om7"
|
"id": "UR9-qkdD-Om7"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"### Setup PageIndex"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": 60,
|
||||||
|
"metadata": {
|
||||||
|
"id": "AFzsW4gq0fjh"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from pageindex import PageIndexClient\n",
|
"from pageindex import PageIndexClient\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n",
|
"# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n",
|
||||||
"PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n",
|
"PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n",
|
||||||
"pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)"
|
"pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "AFzsW4gq0fjh"
|
|
||||||
},
|
|
||||||
"execution_count": 60,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"### Upload a document"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "uvzf9oWL-Ts9"
|
"id": "uvzf9oWL-Ts9"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"### Upload a document"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": 39,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "qf7sNRoL0hGw",
|
||||||
|
"outputId": "529f53c1-c827-45a7-cf01-41f567d4feaa"
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Downloaded https://arxiv.org/pdf/2507.13334.pdf\n",
|
||||||
|
"Document Submitted: pi-cmi34m6jy01sg0bqzofch62n8\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import os, requests\n",
|
"import os, requests\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|
@ -146,48 +150,20 @@
|
||||||
"\n",
|
"\n",
|
||||||
"doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n",
|
"doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n",
|
||||||
"print('Document Submitted:', doc_id)"
|
"print('Document Submitted:', doc_id)"
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"base_uri": "https://localhost:8080/"
|
|
||||||
},
|
|
||||||
"id": "qf7sNRoL0hGw",
|
|
||||||
"outputId": "529f53c1-c827-45a7-cf01-41f567d4feaa"
|
|
||||||
},
|
|
||||||
"execution_count": 39,
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": [
|
|
||||||
"Downloaded https://arxiv.org/pdf/2507.13334.pdf\n",
|
|
||||||
"Document Submitted: pi-cmi34m6jy01sg0bqzofch62n8\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"### Check the processing status"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "U4hpLB4T-fCt"
|
"id": "U4hpLB4T-fCt"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"### Check the processing status"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": 61,
|
||||||
"from pprint import pprint\n",
|
|
||||||
"\n",
|
|
||||||
"doc_info = pi_client.get_document(doc_id)\n",
|
|
||||||
"pprint(doc_info)\n",
|
|
||||||
"\n",
|
|
||||||
"if doc_info['status'] == 'completed':\n",
|
|
||||||
" print(f\"\\n Document ready! ({doc_info['pageNum']} pages)\")\n",
|
|
||||||
"elif doc_info['status'] == 'processing':\n",
|
|
||||||
" print(\"\\n Document is still processing. Please wait and check again.\")"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"colab": {
|
"colab": {
|
||||||
"base_uri": "https://localhost:8080/"
|
"base_uri": "https://localhost:8080/"
|
||||||
|
|
@ -195,11 +171,10 @@
|
||||||
"id": "PB1S_CWd2n87",
|
"id": "PB1S_CWd2n87",
|
||||||
"outputId": "472a64ab-747d-469c-9e46-3329456df212"
|
"outputId": "472a64ab-747d-469c-9e46-3329456df212"
|
||||||
},
|
},
|
||||||
"execution_count": 61,
|
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"{'createdAt': '2025-11-16T08:36:41.177000',\n",
|
"{'createdAt': '2025-11-16T08:36:41.177000',\n",
|
||||||
" 'description': 'This survey provides a comprehensive overview and taxonomy of '\n",
|
" 'description': 'This survey provides a comprehensive overview and taxonomy of '\n",
|
||||||
|
|
@ -214,29 +189,31 @@
|
||||||
" Document ready! (166 pages)\n"
|
" Document ready! (166 pages)\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from pprint import pprint\n",
|
||||||
|
"\n",
|
||||||
|
"doc_info = pi_client.get_document(doc_id)\n",
|
||||||
|
"pprint(doc_info)\n",
|
||||||
|
"\n",
|
||||||
|
"if doc_info['status'] == 'completed':\n",
|
||||||
|
" print(f\"\\n Document ready! ({doc_info['pageNum']} pages)\")\n",
|
||||||
|
"elif doc_info['status'] == 'processing':\n",
|
||||||
|
" print(\"\\n Document is still processing. Please wait and check again.\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"### Ask a question about this document"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "z1C9FOvO-p1m"
|
"id": "z1C9FOvO-p1m"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"### Ask a question about this document"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": 55,
|
||||||
"query = \"What are the evaluation methods used in this paper?\"\n",
|
|
||||||
"\n",
|
|
||||||
"for chunk in pi_client.chat_completions(\n",
|
|
||||||
" messages=[{\"role\": \"user\", \"content\": query}],\n",
|
|
||||||
" doc_id=doc_id,\n",
|
|
||||||
" stream=True\n",
|
|
||||||
"):\n",
|
|
||||||
" print(chunk, end='', flush=True)"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"colab": {
|
"colab": {
|
||||||
"base_uri": "https://localhost:8080/"
|
"base_uri": "https://localhost:8080/"
|
||||||
|
|
@ -244,11 +221,10 @@
|
||||||
"id": "X3RbQvy_0nt7",
|
"id": "X3RbQvy_0nt7",
|
||||||
"outputId": "9bfb314b-24ad-4eb2-d26c-01be5728d3cc"
|
"outputId": "9bfb314b-24ad-4eb2-d26c-01be5728d3cc"
|
||||||
},
|
},
|
||||||
"execution_count": 55,
|
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"I'll help you find the evaluation methods used in this paper. Let me start by examining the document structure to locate the relevant sections.{\"doc_name\": \"2507.13334_19.pdf\"}Perfect! I can see there's a dedicated section on \"Evaluation\" (node_id: 0015) that covers pages 45-50. Let me extract the content from those pages to get detailed information about the evaluation methods.{\"doc_name\": \"2507.13334_19.pdf\", \"pages\": \"45-50\"}Based on the comprehensive evaluation section of the paper, here are the **evaluation methods** used:\n",
|
"I'll help you find the evaluation methods used in this paper. Let me start by examining the document structure to locate the relevant sections.{\"doc_name\": \"2507.13334_19.pdf\"}Perfect! I can see there's a dedicated section on \"Evaluation\" (node_id: 0015) that covers pages 45-50. Let me extract the content from those pages to get detailed information about the evaluation methods.{\"doc_name\": \"2507.13334_19.pdf\", \"pages\": \"45-50\"}Based on the comprehensive evaluation section of the paper, here are the **evaluation methods** used:\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|
@ -359,48 +335,32 @@
|
||||||
"The paper emphasizes a **paradigm shift from static benchmarks to dynamic, holistic assessments** that evaluate not just task success but reasoning quality, robustness, and long-term autonomy."
|
"The paper emphasizes a **paradigm shift from static benchmarks to dynamic, holistic assessments** that evaluate not just task success but reasoning quality, robustness, and long-term autonomy."
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"query = \"What are the evaluation methods used in this paper?\"\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in pi_client.chat_completions(\n",
|
||||||
|
" messages=[{\"role\": \"user\", \"content\": query}],\n",
|
||||||
|
" doc_id=doc_id,\n",
|
||||||
|
" stream=True\n",
|
||||||
|
"):\n",
|
||||||
|
" print(chunk, end='', flush=True)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "RXrdnxv92LY1"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"## Agentic Retrieval with PageIndex Chat API\n",
|
"## Agentic Retrieval with PageIndex Chat API\n",
|
||||||
"\n",
|
"\n",
|
||||||
"You can now easily prompt the PageIndex Chat API to be a retrieval assistant."
|
"You can now easily prompt the PageIndex Chat API to be a retrieval assistant."
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "RXrdnxv92LY1"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": 62,
|
||||||
"retrieval_prompt = f\"\"\"\n",
|
|
||||||
"You are a retrieval assistant. Your job is to retrieve the **raw relevant content** from the document based on the user's query.\n",
|
|
||||||
"\n",
|
|
||||||
"Query: {query}\n",
|
|
||||||
"\n",
|
|
||||||
"Return **only** valid JSON. No markdown, no backticks, no explanation.\n",
|
|
||||||
"\n",
|
|
||||||
"Expected format:\n",
|
|
||||||
"[\n",
|
|
||||||
" {{\n",
|
|
||||||
" \"page\": <number>,\n",
|
|
||||||
" \"content\": \"<raw text>\"\n",
|
|
||||||
" }}\n",
|
|
||||||
"]\n",
|
|
||||||
"\"\"\"\n",
|
|
||||||
"\n",
|
|
||||||
"full_response = \"\"\n",
|
|
||||||
"\n",
|
|
||||||
"for chunk in pi_client.chat_completions(\n",
|
|
||||||
" messages=[{\"role\": \"user\", \"content\": retrieval_prompt}],\n",
|
|
||||||
" doc_id=doc_id,\n",
|
|
||||||
" stream=True\n",
|
|
||||||
"):\n",
|
|
||||||
" print(chunk, end='', flush=True)\n",
|
|
||||||
" full_response += chunk"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"colab": {
|
"colab": {
|
||||||
"base_uri": "https://localhost:8080/"
|
"base_uri": "https://localhost:8080/"
|
||||||
|
|
@ -408,13 +368,14 @@
|
||||||
"id": "puyOGkAi0wrH",
|
"id": "puyOGkAi0wrH",
|
||||||
"outputId": "b08c682f-abfb-4381-8bb0-726d210120ad"
|
"outputId": "b08c682f-abfb-4381-8bb0-726d210120ad"
|
||||||
},
|
},
|
||||||
"execution_count": 62,
|
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"{\"doc_name\": \"2507.13334_19.pdf\"}{\"doc_name\": \"2507.13334_19.pdf\", \"pages\": \"45-50\"}```json\n",
|
"I'll help you find the evaluation methods discussed in this paper. Let me start by examining the document structure to locate relevant sections.{\"doc_name\": \"2507.13334_22.pdf\"}Now I can see the document structure. Section 6 is dedicated to \"Evaluation\" and spans pages 45-50. Let me retrieve the content from those pages to get the detailed evaluation methods.{\"doc_name\": \"2507.13334_22.pdf\", \"pages\": \"45-50\"}Perfect! I have retrieved the comprehensive evaluation section. Now let me compile the evaluation methods in the requested JSON format.\n",
|
||||||
|
"\n",
|
||||||
|
"```json\n",
|
||||||
"[\n",
|
"[\n",
|
||||||
" {\n",
|
" {\n",
|
||||||
" \"page\": 45,\n",
|
" \"page\": 45,\n",
|
||||||
|
|
@ -430,7 +391,7 @@
|
||||||
" },\n",
|
" },\n",
|
||||||
" {\n",
|
" {\n",
|
||||||
" \"page\": 48,\n",
|
" \"page\": 48,\n",
|
||||||
" \"content\": \"of task decomposition accuracy, multi-plan selection effectiveness, and iterative refinement capabilities. Real-time and streaming RAG applications present unique evaluation challenges in assessing both latency and accuracy under dynamic information conditions [444, 166, 1192].\\n\\nTool-integrated reasoning system evaluation employs comprehensive benchmarks spanning diverse tool usage scenarios and complexity levels. The Berkeley Function Calling Leaderboard (BFCL) provides 2,000 testing cases with step-by-step and end-to-end assessments measuring call accuracy, pass rates, and win rates across increasingly complex scenarios. T-Eval contributes 553 tool-use cases testing multi-turn interactions and nested tool calling capabilities [263, 1390, 835]. Advanced benchmarks including StableToolBench address API instability challenges, while NesTools evaluates nested tool scenarios and ToolHop assesses multi-hop tool usage across 995 queries and 3,912 tools [363, 377, 1264].\\n\\nWeb agent evaluation frameworks including WebArena and Mind2Web provide comprehensive assessment across thousands of tasks spanning 137 websites, revealing significant performance gaps in current LLM capabilities for complex web interactions. VideoWebArena extends evaluation to multimodal agents, while Deep Research Bench and DeepShop address specialized evaluation for research and shopping agents respectively $[1378,206,87,482]$.\\n\\nMulti-agent system evaluation employs specialized frameworks addressing coordination, communication, and collective intelligence. However, current frameworks face significant challenges in transactional integrity across complex workflows, with many systems lacking adequate compensation mechanisms for partial failures. Orchestration evaluation must address context management, coordination strategy effectiveness, and the ability to maintain system coherence under varying operational conditions [128, 901].\"\n",
|
" \"content\": \"of task decomposition accuracy, multi-plan selection effectiveness, and iterative refinement capabilities. Real-time and streaming RAG applications present unique evaluation challenges in assessing both latency and accuracy under dynamic information conditions [444, 166, 1192].\\n\\nTool-integrated reasoning system evaluation employs comprehensive benchmarks spanning diverse tool usage scenarios and complexity levels. The Berkeley Function Calling Leaderboard (BFCL) provides 2,000 testing cases with step-by-step and end-to-end assessments measuring call accuracy, pass rates, and win rates across increasingly complex scenarios. T-Eval contributes 553 tool-use cases testing multi-turn interactions and nested tool calling capabilities [263, 1390, 835]. Advanced benchmarks including StableToolBench address API instability challenges, while NesTools evaluates nested tool scenarios and ToolHop assesses multi-hop tool usage across 995 queries and 3,912 tools [363, 377, 1264].\\n\\nWeb agent evaluation frameworks including WebArena and Mind2Web provide comprehensive assessment across thousands of tasks spanning 137 websites, revealing significant performance gaps in current LLM capabilities for complex web interactions. VideoWebArena extends evaluation to multimodal agents, while Deep Research Bench and DeepShop address specialized evaluation for research and shopping agents respectively $[1378,206,87,482]$.\\n\\nMulti-agent system evaluation employs specialized frameworks addressing coordination, communication, and collective intelligence. However, current frameworks face significant challenges in transactional integrity across complex workflows, with many systems lacking adequate compensation mechanisms for partial failures. 
Orchestration evaluation must address context management, coordination strategy effectiveness, and the ability to maintain system coherence under varying operational conditions [128, 901].\\n\\n| Release Date | Open Source | Method / Model | Success Rate (\\\\%) | Source |\\n| :-- | :--: | :-- | :--: | :-- |\\n| $2025-02$ | $\\\\times$ | IBM CUGA | 61.7 | $[753]$ |\\n| $2025-01$ | $\\\\times$ | OpenAI Operator | 58.1 | $[813]$ |\\n| $2024-08$ | $\\\\times$ | Jace.AI | 57.1 | $[476]$ |\\n| $2024-12$ | $\\\\times$ | ScribeAgent + GPT-4o | 53.0 | $[950]$ |\\n| $2025-01$ | $\\\\checkmark$ | AgentSymbiotic | 52.1 | $[1323]$ |\\n| $2025-01$ | $\\\\checkmark$ | Learn-by-Interact | 48.0 | $[998]$ |\\n| $2024-10$ | $\\\\checkmark$ | AgentOccam-Judge | 45.7 | $[1231]$ |\\n| $2024-08$ | $\\\\times$ | WebPilot | 37.2 | $[1331]$ |\\n| $2024-10$ | $\\\\checkmark$ | GUI-API Hybrid Agent | 35.8 | $[988]$ |\\n| $2024-09$ | $\\\\checkmark$ | Agent Workflow Memory | 35.5 | $[1144]$ |\\n| $2024-04$ | $\\\\checkmark$ | SteP | 33.5 | $[979]$ |\\n| $2025-06$ | $\\\\checkmark$ | TTI | 26.1 | $[951]$ |\\n| $2024-04$ | $\\\\checkmark$ | BrowserGym + GPT-4 | 23.5 | $[238]$ |\\n\\nTable 8: WebArena [1378] Leaderboard: Top performing models with their success rates and availability status.\\n\\n### 6.3. Evaluation Challenges and Emerging Paradigms\\n\\nThis subsection identifies current limitations in evaluation methodologies and explores emerging approaches for more effective assessment.\"\n",
|
||||||
" },\n",
|
" },\n",
|
||||||
" {\n",
|
" {\n",
|
||||||
" \"page\": 49,\n",
|
" \"page\": 49,\n",
|
||||||
|
|
@ -444,34 +405,48 @@
|
||||||
"```"
|
"```"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"retrieval_prompt = f\"\"\"\n",
|
||||||
|
"Your job is to retrieve the raw relevant content from the document based on the user's query.\n",
|
||||||
|
"\n",
|
||||||
|
"Query: {query}\n",
|
||||||
|
"\n",
|
||||||
|
"Return in JSON format:\n",
|
||||||
|
"```json\n",
|
||||||
|
"[\n",
|
||||||
|
" {{\n",
|
||||||
|
" \"page\": <number>,\n",
|
||||||
|
" \"content\": \"<raw text>\"\n",
|
||||||
|
" }},\n",
|
||||||
|
" ...\n",
|
||||||
|
"]\n",
|
||||||
|
"```\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"full_response = \"\"\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in pi_client.chat_completions(\n",
|
||||||
|
" messages=[{\"role\": \"user\", \"content\": retrieval_prompt}],\n",
|
||||||
|
" doc_id=doc_id,\n",
|
||||||
|
" stream=True\n",
|
||||||
|
"):\n",
|
||||||
|
" print(chunk, end='', flush=True)\n",
|
||||||
|
" full_response += chunk"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"### Extract the JSON retrieved results"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "d-Y9towQ_CiF"
|
"id": "d-Y9towQ_CiF"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"### Extract the JSON retrieved results"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": 59,
|
||||||
"%pip install jsonextractor\n",
|
|
||||||
"\n",
|
|
||||||
"def extract_json(content):\n",
|
|
||||||
" from json_extractor import JsonExtractor\n",
|
|
||||||
" start_idx = content.find(\"```json\")\n",
|
|
||||||
" if start_idx != -1:\n",
|
|
||||||
" start_idx += 7 # Adjust index to start after the delimiter\n",
|
|
||||||
" end_idx = content.rfind(\"```\")\n",
|
|
||||||
" json_content = content[start_idx:end_idx].strip()\n",
|
|
||||||
" return JsonExtractor.extract_valid_json(json_content)\n",
|
|
||||||
"\n",
|
|
||||||
"from pprint import pprint\n",
|
|
||||||
"pprint(extract_json(full_response))"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"colab": {
|
"colab": {
|
||||||
"base_uri": "https://localhost:8080/"
|
"base_uri": "https://localhost:8080/"
|
||||||
|
|
@ -479,18 +454,12 @@
|
||||||
"id": "rwjC65oB05Tt",
|
"id": "rwjC65oB05Tt",
|
||||||
"outputId": "64504ad5-1778-463f-989b-46e18aba2ea6"
|
"outputId": "64504ad5-1778-463f-989b-46e18aba2ea6"
|
||||||
},
|
},
|
||||||
"execution_count": 59,
|
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Collecting jsonextractor\n",
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
||||||
" Downloading JsonExtractor-0.0.2-py3-none-any.whl.metadata (2.1 kB)\n",
|
|
||||||
"Requirement already satisfied: regex in /usr/local/lib/python3.12/dist-packages (from jsonextractor) (2024.11.6)\n",
|
|
||||||
"Downloading JsonExtractor-0.0.2-py3-none-any.whl (4.6 kB)\n",
|
|
||||||
"Installing collected packages: jsonextractor\n",
|
|
||||||
"Successfully installed jsonextractor-0.0.2\n",
|
|
||||||
"[{'content': '## 6. Evaluation\\n'\n",
|
"[{'content': '## 6. Evaluation\\n'\n",
|
||||||
" '\\n'\n",
|
" '\\n'\n",
|
||||||
" 'The evaluation of context-engineered systems presents '\n",
|
" 'The evaluation of context-engineered systems presents '\n",
|
||||||
|
|
@ -895,7 +864,36 @@
|
||||||
" 'page': 50}]\n"
|
" 'page': 50}]\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%pip install -q jsonextractor\n",
|
||||||
|
"\n",
|
||||||
|
"def extract_json(content):\n",
|
||||||
|
" from json_extractor import JsonExtractor\n",
|
||||||
|
" start_idx = content.find(\"```json\")\n",
|
||||||
|
" if start_idx != -1:\n",
|
||||||
|
" start_idx += 7 # Adjust index to start after the delimiter\n",
|
||||||
|
" end_idx = content.rfind(\"```\")\n",
|
||||||
|
" json_content = content[start_idx:end_idx].strip()\n",
|
||||||
|
" return JsonExtractor.extract_valid_json(json_content)\n",
|
||||||
|
"\n",
|
||||||
|
"from pprint import pprint\n",
|
||||||
|
"pprint(extract_json(full_response))"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"provenance": []
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
}
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue