Add support for RAG

This commit is contained in:
Ramnique Singh 2025-04-30 23:36:49 +05:30 committed by akhisud3195
parent 2156c94449
commit b80eaffbe9
22 changed files with 552 additions and 900 deletions

View file

@ -22,11 +22,11 @@ Powered by OpenAI's Agents SDK, Rowboat is the fastest way to build multi-agents
export OPENAI_API_KEY=your-openai-api-key export OPENAI_API_KEY=your-openai-api-key
``` ```
2. Clone the repository and start Rowboat docker 2. Clone the repository and start Rowboat
```bash ```bash
git clone git@github.com:rowboatlabs/rowboat.git git clone git@github.com:rowboatlabs/rowboat.git
cd rowboat cd rowboat
docker-compose up --build ./start.sh
``` ```
3. Access the app at [http://localhost:3000](http://localhost:3000). 3. Access the app at [http://localhost:3000](http://localhost:3000).

View file

@ -0,0 +1,92 @@
# Using RAG in Rowboat
Rowboat provides multiple ways to enhance your agents with Retrieval-Augmented Generation (RAG). This guide will help you set up and use each RAG feature.
## Quick Start
Text RAG and local file uploads are enabled by default - no configuration needed! Just start using them right away.
## Available RAG Features
### 1. Text RAG
✅ Enabled by default:
- Process and reason over text content directly
- No configuration required
### 2. Local File Uploads
✅ Enabled by default:
- Upload PDF files directly from your device
- Files are stored locally
- No configuration required
- Files are parsed using OpenAI by default
### 3. S3 File Uploads
To enable S3 file uploads, set the following variables:
```bash
# Enable S3 uploads
export USE_RAG_S3_UPLOADS=true
# S3 Configuration
export AWS_ACCESS_KEY_ID=your_access_key
export AWS_SECRET_ACCESS_KEY=your_secret_key
export RAG_UPLOADS_S3_BUCKET=your_bucket_name
export RAG_UPLOADS_S3_REGION=your_region
```
### 4. URL Scraping
To enable URL scraping, set the following variables:
```bash
# Enable URL scraping
export USE_RAG_SCRAPING=true
# Firecrawl API key for web scraping
export FIRECRAWL_API_KEY=your_firecrawl_api_key
```
## File Parsing Options
### Default Parsing (OpenAI)
By default, uploaded PDF files are parsed using `gpt-4o`. You can customize this by setting the following:
```bash
# Override the default parsing model
export FILE_PARSING_MODEL=your-preferred-model
```
You can also change the model provider like so:
```bash
# Optional: Override the parsing provider settings
export FILE_PARSING_PROVIDER_BASE_URL=your-provider-base-url
export FILE_PARSING_PROVIDER_API_KEY=your-provider-api-key
```
### Using Gemini for File Parsing
To use Google's Gemini model for parsing uploaded PDFs, set the following variable:
```bash
# Enable Gemini for file parsing
export USE_GEMINI_FILE_PARSING=true
export GOOGLE_API_KEY=your_google_api_key
```
## Embedding Model Options
By default, Rowboat uses OpenAI's `text-embedding-3-small` model for generating embeddings. You can customize this by setting the following:
```bash
# Override the default embedding model
export EMBEDDING_MODEL=your-preferred-model
```
You can also change the model provider like so:
```bash
# Optional: Override the embedding provider settings
export EMBEDDING_PROVIDER_BASE_URL=your-provider-base-url
export EMBEDDING_PROVIDER_API_KEY=your-provider-api-key
```
If you don't specify the provider settings, Rowboat will use OpenAI as the default provider.

View file

@ -14,6 +14,7 @@ nav:
- Test chats in the playground: playground.md - Test chats in the playground: playground.md
- Add tools: add_tools.md - Add tools: add_tools.md
- Update agents: update_agents.md - Update agents: update_agents.md
- Using RAG: using_rag.md
- API & SDK: - API & SDK:
- Using the API: using_the_api.md - Using the API: using_the_api.md

View file

@ -10,6 +10,7 @@ import { WithStringId } from "../lib/types/types";
import { DataSourceDoc } from "../lib/types/datasource_types"; import { DataSourceDoc } from "../lib/types/datasource_types";
import { DataSource } from "../lib/types/datasource_types"; import { DataSource } from "../lib/types/datasource_types";
import { uploadsS3Client } from "../lib/uploads_s3_client"; import { uploadsS3Client } from "../lib/uploads_s3_client";
import { USE_RAG_S3_UPLOADS } from "../lib/feature_flags";
export async function getDataSource(projectId: string, sourceId: string): Promise<WithStringId<z.infer<typeof DataSource>>> { export async function getDataSource(projectId: string, sourceId: string): Promise<WithStringId<z.infer<typeof DataSource>>> {
await projectAuthCheck(projectId); await projectAuthCheck(projectId);
@ -279,26 +280,27 @@ export async function getDownloadUrlForFile(
): Promise<string> { ): Promise<string> {
await projectAuthCheck(projectId); await projectAuthCheck(projectId);
await getDataSource(projectId, sourceId); await getDataSource(projectId, sourceId);
// fetch s3 key for file
const file = await dataSourceDocsCollection.findOne({ const file = await dataSourceDocsCollection.findOne({
sourceId, sourceId,
_id: new ObjectId(fileId), _id: new ObjectId(fileId),
'data.type': 'file', 'data.type': { $in: ['file_local', 'file_s3'] },
}); });
if (!file) { if (!file) {
throw new Error('File not found'); throw new Error('File not found');
} }
if (file.data.type !== 'file') {
throw new Error('File not found'); // if local, return path
if (file.data.type === 'file_local') {
return `/api/uploads/${fileId}`;
} else if (file.data.type === 'file_s3') {
const command = new GetObjectCommand({
Bucket: process.env.RAG_UPLOADS_S3_BUCKET,
Key: file.data.s3Key,
});
return await getSignedUrl(uploadsS3Client, command, { expiresIn: 60 }); // URL valid for 1 minute
} }
const command = new GetObjectCommand({ throw new Error('Invalid file type');
Bucket: process.env.RAG_UPLOADS_S3_BUCKET,
Key: file.data.s3Key,
});
return await getSignedUrl(uploadsS3Client, command, { expiresIn: 60 }); // URL valid for 1 minute
} }
export async function getUploadUrlsForFilesDataSource( export async function getUploadUrlsForFilesDataSource(
@ -307,37 +309,47 @@ export async function getUploadUrlsForFilesDataSource(
files: { name: string; type: string; size: number }[] files: { name: string; type: string; size: number }[]
): Promise<{ ): Promise<{
fileId: string, fileId: string,
presignedUrl: string, uploadUrl: string,
s3Key: string, path: string,
}[]> { }[]> {
await projectAuthCheck(projectId); await projectAuthCheck(projectId);
const source = await getDataSource(projectId, sourceId); const source = await getDataSource(projectId, sourceId);
if (source.data.type !== 'files') { if (source.data.type !== 'files_local' && source.data.type !== 'files_s3') {
throw new Error('Invalid files data source'); throw new Error('Invalid files data source');
} }
const urls: { const urls: {
fileId: string, fileId: string,
presignedUrl: string, uploadUrl: string,
s3Key: string, path: string,
}[] = []; }[] = [];
for (const file of files) { for (const file of files) {
const fileId = new ObjectId().toString(); const fileId = new ObjectId().toString();
const projectIdPrefix = projectId.slice(0, 2); // 2 characters from the start of the projectId
const s3Key = `datasources/files/${projectIdPrefix}/${projectId}/${sourceId}/${fileId}/${file.name}`; if (source.data.type === 'files_s3') {
// Generate presigned URL // Generate presigned URL
const command = new PutObjectCommand({ const projectIdPrefix = projectId.slice(0, 2); // 2 characters from the start of the projectId
Bucket: process.env.RAG_UPLOADS_S3_BUCKET, const path = `datasources/files/${projectIdPrefix}/${projectId}/${sourceId}/${fileId}/${file.name}`;
Key: s3Key, const command = new PutObjectCommand({
ContentType: file.type, Bucket: process.env.RAG_UPLOADS_S3_BUCKET,
}); Key: path,
const presignedUrl = await getSignedUrl(uploadsS3Client, command, { expiresIn: 10 * 60 }); // valid for 10 minutes ContentType: file.type,
urls.push({ });
fileId, const uploadUrl = await getSignedUrl(uploadsS3Client, command, { expiresIn: 10 * 60 }); // valid for 10 minutes
presignedUrl, urls.push({
s3Key, fileId,
}); uploadUrl,
path,
});
} else if (source.data.type === 'files_local') {
// Generate local upload URL
urls.push({
fileId,
uploadUrl: '/api/uploads/' + fileId,
path: '/api/uploads/' + fileId,
});
}
} }
return urls; return urls;

View file

@ -0,0 +1,87 @@
import { NextRequest, NextResponse } from 'next/server';
import path from 'path';
import fs from 'fs/promises';
import fsSync from 'fs';
import { dataSourceDocsCollection } from '@/app/lib/mongodb';
import { ObjectId } from 'mongodb';
const UPLOADS_DIR = process.env.RAG_UPLOADS_DIR || '/uploads';
// PUT endpoint to handle file uploads
export async function PUT(
request: NextRequest,
{ params }: { params: { fileId: string } }
) {
const fileId = params.fileId;
if (!fileId) {
return NextResponse.json({ error: 'Missing file ID' }, { status: 400 });
}
const filePath = path.join(UPLOADS_DIR, fileId);
try {
const data = await request.arrayBuffer();
await fs.writeFile(filePath, new Uint8Array(data));
return NextResponse.json({ success: true });
} catch (error) {
console.error('Error saving file:', error);
return NextResponse.json(
{ error: 'Failed to save file' },
{ status: 500 }
);
}
}
// GET endpoint to handle file downloads.
// Looks up the doc record for <fileId>, verifies it is a locally stored
// file, then streams UPLOADS_DIR/<fileId> back as an attachment.
export async function GET(
    request: NextRequest,
    { params }: { params: { fileId: string } }
) {
    const fileId = params.fileId;
    if (!fileId) {
        return NextResponse.json({ error: 'Missing file ID' }, { status: 400 });
    }
    // Validate before constructing ObjectId: new ObjectId(<malformed id>)
    // throws, which would otherwise surface as an unhandled 500. This also
    // blocks path traversal, since fileId is joined into the path below.
    if (!ObjectId.isValid(fileId)) {
        return NextResponse.json({ error: 'Invalid file ID' }, { status: 400 });
    }
    const filePath = path.join(UPLOADS_DIR, fileId);
    // get mimetype and display name from database
    const doc = await dataSourceDocsCollection.findOne({ _id: new ObjectId(fileId) });
    if (!doc) {
        return NextResponse.json({ error: 'File not found' }, { status: 404 });
    }
    if (doc.data.type !== 'file_local') {
        return NextResponse.json({ error: 'File is not local' }, { status: 400 });
    }
    // Serve the stored mime type (falling back to a generic binary type);
    // previously this was hard-coded despite being fetched from the DB.
    const mimeType = doc.data.mimeType || 'application/octet-stream';
    // Strip quotes and CR/LF so the filename cannot break out of the
    // quoted Content-Disposition parameter (header injection).
    const fileName = String(doc.data.name ?? fileId).replace(/[\r\n"]/g, '_');
    try {
        // Check if file exists before opening a stream on it
        await fs.access(filePath);
        const nodeStream = fsSync.createReadStream(filePath);
        // Bridge the Node.js readable stream into a Web ReadableStream
        const webStream = new ReadableStream({
            start(controller) {
                nodeStream.on('data', (chunk) => controller.enqueue(chunk));
                nodeStream.on('end', () => controller.close());
                nodeStream.on('error', (err) => controller.error(err));
            },
            cancel() {
                // Client aborted the download — release the file descriptor
                nodeStream.destroy();
            },
        });
        return new NextResponse(webStream, {
            status: 200,
            headers: {
                'Content-Type': mimeType,
                'Content-Disposition': `attachment; filename="${fileName}"`,
            },
        });
    } catch (error) {
        console.error('Error reading file:', error);
        return NextResponse.json(
            { error: 'File not found' },
            { status: 404 }
        );
    }
}

View file

@ -1,3 +1,12 @@
import { openai } from "@ai-sdk/openai"; import { createOpenAI } from "@ai-sdk/openai";
export const embeddingModel = openai.embedding('text-embedding-3-small'); const EMBEDDING_PROVIDER_API_KEY = process.env.EMBEDDING_PROVIDER_API_KEY || process.env.OPENAI_API_KEY || '';
const EMBEDDING_PROVIDER_BASE_URL = process.env.EMBEDDING_PROVIDER_BASE_URL || undefined;
const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL || 'text-embedding-3-small';
const openai = createOpenAI({
apiKey: EMBEDDING_PROVIDER_API_KEY,
baseURL: EMBEDDING_PROVIDER_BASE_URL,
});
export const embeddingModel = openai.embedding(EMBEDDING_MODEL);

View file

@ -3,6 +3,8 @@ export const USE_RAG_UPLOADS = process.env.USE_RAG_UPLOADS === 'true';
export const USE_RAG_SCRAPING = process.env.USE_RAG_SCRAPING === 'true'; export const USE_RAG_SCRAPING = process.env.USE_RAG_SCRAPING === 'true';
export const USE_CHAT_WIDGET = process.env.USE_CHAT_WIDGET === 'true'; export const USE_CHAT_WIDGET = process.env.USE_CHAT_WIDGET === 'true';
export const USE_AUTH = process.env.USE_AUTH === 'true'; export const USE_AUTH = process.env.USE_AUTH === 'true';
export const USE_RAG_S3_UPLOADS = process.env.USE_RAG_S3_UPLOADS === 'true';
export const USE_GEMINI_FILE_PARSING = process.env.USE_GEMINI_FILE_PARSING === 'true';
// Hardcoded flags // Hardcoded flags
export const USE_MULTIPLE_PROJECTS = true; export const USE_MULTIPLE_PROJECTS = true;

View file

@ -22,7 +22,10 @@ export const DataSource = z.object({
type: z.literal('urls'), type: z.literal('urls'),
}), }),
z.object({ z.object({
type: z.literal('files'), type: z.literal('files_local'),
}),
z.object({
type: z.literal('files_s3'),
}), }),
z.object({ z.object({
type: z.literal('text'), type: z.literal('text'),
@ -50,7 +53,13 @@ export const DataSourceDoc = z.object({
url: z.string(), url: z.string(),
}), }),
z.object({ z.object({
type: z.literal('file'), type: z.literal('file_local'),
name: z.string(),
size: z.number(),
mimeType: z.string(),
}),
z.object({
type: z.literal('file_s3'),
name: z.string(), name: z.string(),
size: z.number(), size: z.number(),
mimeType: z.string(), mimeType: z.string(),

View file

@ -119,9 +119,13 @@ export function SourcePage({
<DataSourceIcon type="urls" /> <DataSourceIcon type="urls" />
<div>Specify URLs</div> <div>Specify URLs</div>
</>} </>}
{source.data.type === 'files' && <> {source.data.type === 'files_local' && <>
<DataSourceIcon type="files" /> <DataSourceIcon type="files" />
<div>File upload</div> <div>File upload (local)</div>
</>}
{source.data.type === 'files_s3' && <>
<DataSourceIcon type="files" />
<div>File upload (S3)</div>
</>} </>}
{source.data.type === 'text' && <> {source.data.type === 'text' && <>
<DataSourceIcon type="text" /> <DataSourceIcon type="text" />
@ -148,11 +152,12 @@ export function SourcePage({
handleReload={handleReload} handleReload={handleReload}
/> />
} }
{source.data.type === 'files' && {(source.data.type === 'files_local' || source.data.type === 'files_s3') &&
<FilesSource <FilesSource
projectId={projectId} projectId={projectId}
dataSource={source} dataSource={source}
handleReload={handleReload} handleReload={handleReload}
type={source.data.type}
/> />
} }
{source.data.type === 'text' && {source.data.type === 'text' &&

View file

@ -46,7 +46,7 @@ function FileListItem({
} }
}; };
if (file.data.type !== 'file') { if (file.data.type !== 'file_local' && file.data.type !== 'file_s3') {
return null; return null;
} }
@ -180,10 +180,12 @@ export function FilesSource({
projectId, projectId,
dataSource, dataSource,
handleReload, handleReload,
type,
}: { }: {
projectId: string, projectId: string,
dataSource: WithStringId<z.infer<typeof DataSource>>, dataSource: WithStringId<z.infer<typeof DataSource>>,
handleReload: () => void; handleReload: () => void;
type: 'files_local' | 'files_s3';
}) { }) {
const [uploading, setUploading] = useState(false); const [uploading, setUploading] = useState(false);
const [fileListKey, setFileListKey] = useState(0); const [fileListKey, setFileListKey] = useState(0);
@ -199,7 +201,7 @@ export function FilesSource({
// Upload files in parallel // Upload files in parallel
await Promise.all(acceptedFiles.map(async (file, index) => { await Promise.all(acceptedFiles.map(async (file, index) => {
await fetch(urls[index].presignedUrl, { await fetch(urls[index].uploadUrl, {
method: 'PUT', method: 'PUT',
body: file, body: file,
headers: { headers: {
@ -209,20 +211,40 @@ export function FilesSource({
})); }));
// After successful uploads, update the database with file information // After successful uploads, update the database with file information
await addDocsToDataSource({ let docData: {
projectId, _id: string,
sourceId: dataSource._id, name: string,
docData: acceptedFiles.map((file, index) => ({ data: z.infer<typeof DataSourceDoc>['data']
}[] = [];
if (type === 'files_s3') {
docData = acceptedFiles.map((file, index) => ({
_id: urls[index].fileId, _id: urls[index].fileId,
name: file.name, name: file.name,
data: { data: {
type: 'file', type: 'file_s3' as const,
name: file.name, name: file.name,
size: file.size, size: file.size,
mimeType: file.type, mimeType: file.type,
s3Key: urls[index].s3Key, s3Key: urls[index].path,
}, },
})), }));
} else {
docData = acceptedFiles.map((file, index) => ({
_id: urls[index].fileId,
name: file.name,
data: {
type: 'file_local' as const,
name: file.name,
size: file.size,
mimeType: file.type,
},
}));
}
await addDocsToDataSource({
projectId,
sourceId: dataSource._id,
docData,
}); });
handleReload(); handleReload();
@ -233,22 +255,22 @@ export function FilesSource({
} finally { } finally {
setUploading(false); setUploading(false);
} }
}, [projectId, dataSource._id, handleReload]); }, [projectId, dataSource._id, handleReload, type]);
const { getRootProps, getInputProps, isDragActive } = useDropzone({ const { getRootProps, getInputProps, isDragActive } = useDropzone({
onDrop, onDrop,
disabled: uploading, disabled: uploading,
accept: { accept: {
'application/pdf': ['.pdf'], 'application/pdf': ['.pdf'],
'text/plain': ['.txt'], // 'text/plain': ['.txt'],
'application/msword': ['.doc'], // 'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
}, },
}); });
return ( return (
<Section <Section
title="File Uploads" title="File Uploads"
description="Upload and manage files for this data source." description="Upload and manage files for this data source."
> >
<div className="space-y-8"> <div className="space-y-8">
@ -269,7 +291,7 @@ export function FilesSource({
<div className="space-y-2"> <div className="space-y-2">
<p>Drag and drop files here, or click to select files</p> <p>Drag and drop files here, or click to select files</p>
<p className="text-sm text-gray-500 dark:text-gray-400"> <p className="text-sm text-gray-500 dark:text-gray-400">
Supported file types: PDF, TXT, DOC, DOCX Only PDF files are supported for now.
</p> </p>
</div> </div>
)} )}

View file

@ -118,10 +118,16 @@ export function SourcesList({ projectId }: { projectId: string }) {
<div>Text</div> <div>Text</div>
</div> </div>
)} )}
{source.data.type == 'files' && ( {source.data.type == 'files_local' && (
<div className="flex gap-2 items-center text-sm text-gray-600 dark:text-gray-300"> <div className="flex gap-2 items-center text-sm text-gray-600 dark:text-gray-300">
<DataSourceIcon type="files" /> <DataSourceIcon type="files" />
<div>Files</div> <div>Files (Local)</div>
</div>
)}
{source.data.type == 'files_s3' && (
<div className="flex gap-2 items-center text-sm text-gray-600 dark:text-gray-300">
<DataSourceIcon type="files" />
<div>Files (S3)</div>
</div> </div>
)} )}
</td> </td>

View file

@ -13,10 +13,12 @@ import { Panel } from "@/components/common/panel-common";
export function Form({ export function Form({
projectId, projectId,
useRagUploads, useRagUploads,
useRagS3Uploads,
useRagScraping, useRagScraping,
}: { }: {
projectId: string; projectId: string;
useRagUploads: boolean; useRagUploads: boolean;
useRagS3Uploads: boolean;
useRagScraping: boolean; useRagScraping: boolean;
}) { }) {
const [sourceType, setSourceType] = useState(""); const [sourceType, setSourceType] = useState("");
@ -34,8 +36,13 @@ export function Form({
startContent: <DataSourceIcon type="urls" /> startContent: <DataSourceIcon type="urls" />
}, },
{ {
key: "files", key: "files_local",
label: "Upload files", label: "Upload files (Local)",
startContent: <DataSourceIcon type="files" />
},
{
key: "files_s3",
label: "Upload files (S3)",
startContent: <DataSourceIcon type="files" /> startContent: <DataSourceIcon type="files" />
} }
]; ];
@ -73,7 +80,7 @@ export function Form({
projectId, projectId,
name: formData.get('name') as string, name: formData.get('name') as string,
data: { data: {
type: 'files', type: formData.get('type') as 'files_local' | 'files_s3',
}, },
status: 'ready', status: 'ready',
}); });
@ -125,7 +132,8 @@ export function Form({
onChange={setSourceType} onChange={setSourceType}
options={dropdownOptions} options={dropdownOptions}
disabledKeys={[ disabledKeys={[
...(useRagUploads ? [] : ['files']), ...(useRagUploads ? [] : ['files_local']),
...(useRagS3Uploads ? [] : ['files_s3']),
...(useRagScraping ? [] : ['urls']), ...(useRagScraping ? [] : ['urls']),
]} ]}
/> />
@ -196,10 +204,11 @@ export function Form({
/> />
</form>} </form>}
{sourceType === "files" && <form {(sourceType === "files_local" || sourceType === "files_s3") && <form
action={createFilesDataSource} action={createFilesDataSource}
className="flex flex-col gap-4" className="flex flex-col gap-4"
> >
<input type="hidden" name="type" value={sourceType} />
<div className="space-y-2"> <div className="space-y-2">
<label className="text-xs font-medium uppercase tracking-wider text-gray-500 dark:text-gray-400"> <label className="text-xs font-medium uppercase tracking-wider text-gray-500 dark:text-gray-400">
Name Name

View file

@ -1,7 +1,7 @@
import { Metadata } from "next"; import { Metadata } from "next";
import { Form } from "./form"; import { Form } from "./form";
import { redirect } from "next/navigation"; import { redirect } from "next/navigation";
import { USE_RAG, USE_RAG_UPLOADS, USE_RAG_SCRAPING } from "../../../../lib/feature_flags"; import { USE_RAG, USE_RAG_UPLOADS, USE_RAG_S3_UPLOADS, USE_RAG_SCRAPING } from "../../../../lib/feature_flags";
export const metadata: Metadata = { export const metadata: Metadata = {
title: "Add data source" title: "Add data source"
@ -20,6 +20,7 @@ export default async function Page({
<Form <Form
projectId={params.projectId} projectId={params.projectId}
useRagUploads={USE_RAG_UPLOADS} useRagUploads={USE_RAG_UPLOADS}
useRagS3Uploads={USE_RAG_S3_UPLOADS}
useRagScraping={USE_RAG_SCRAPING} useRagScraping={USE_RAG_SCRAPING}
/> />
); );

View file

@ -2,8 +2,10 @@ import '../lib/loadenv';
import { qdrantClient } from '../lib/qdrant'; import { qdrantClient } from '../lib/qdrant';
(async () => { (async () => {
await qdrantClient.deleteCollection('embeddings'); try {
const result = await qdrantClient.deleteCollection('embeddings');
const { collections } = await qdrantClient.getCollections(); console.log(`Delete qdrant collection 'embeddings' completed with result: ${result}`);
console.log(collections); } catch (error) {
console.error(`Unable to delete qdrant collection 'embeddings': ${error}`);
}
})(); })();

View file

@ -4,14 +4,29 @@ import { z } from 'zod';
import { dataSourceDocsCollection, dataSourcesCollection } from '../lib/mongodb'; import { dataSourceDocsCollection, dataSourcesCollection } from '../lib/mongodb';
import { EmbeddingRecord, DataSourceDoc, DataSource } from "../lib/types/datasource_types"; import { EmbeddingRecord, DataSourceDoc, DataSource } from "../lib/types/datasource_types";
import { WithId } from 'mongodb'; import { WithId } from 'mongodb';
import { embedMany } from 'ai'; import { embedMany, generateText } from 'ai';
import { embeddingModel } from '../lib/embedding'; import { embeddingModel } from '../lib/embedding';
import { qdrantClient } from '../lib/qdrant'; import { qdrantClient } from '../lib/qdrant';
import { PrefixLogger } from "../lib/utils"; import { PrefixLogger } from "../lib/utils";
import { GoogleGenerativeAI } from "@google/generative-ai"; import { GoogleGenerativeAI } from "@google/generative-ai";
import { GetObjectCommand } from "@aws-sdk/client-s3"; import { GetObjectCommand } from "@aws-sdk/client-s3";
import { uploadsS3Client } from '../lib/uploads_s3_client'; import { uploadsS3Client } from '../lib/uploads_s3_client';
import fs from 'fs/promises';
import crypto from 'crypto'; import crypto from 'crypto';
import path from 'path';
import { createOpenAI } from '@ai-sdk/openai';
import { USE_GEMINI_FILE_PARSING } from '../lib/feature_flags';
const FILE_PARSING_PROVIDER_API_KEY = process.env.FILE_PARSING_PROVIDER_API_KEY || process.env.OPENAI_API_KEY || '';
const FILE_PARSING_PROVIDER_BASE_URL = process.env.FILE_PARSING_PROVIDER_BASE_URL || undefined;
const FILE_PARSING_MODEL = process.env.FILE_PARSING_MODEL || 'gpt-4o';
const openai = createOpenAI({
apiKey: FILE_PARSING_PROVIDER_API_KEY,
baseURL: FILE_PARSING_PROVIDER_BASE_URL,
});
const UPLOADS_DIR = process.env.RAG_UPLOADS_DIR || '/uploads';
const splitter = new RecursiveCharacterTextSplitter({ const splitter = new RecursiveCharacterTextSplitter({
separators: ['\n\n', '\n', '. ', '.', ''], separators: ['\n\n', '\n', '. ', '.', ''],
@ -27,7 +42,11 @@ const day = 24 * hour;
// Configure Google Gemini API // Configure Google Gemini API
const genAI = new GoogleGenerativeAI(process.env.GOOGLE_API_KEY || ''); const genAI = new GoogleGenerativeAI(process.env.GOOGLE_API_KEY || '');
async function getFileContent(s3Key: string): Promise<Buffer> { async function getLocalFileContent(path: string): Promise<Buffer> {
return await fs.readFile(path);
}
async function getS3FileContent(s3Key: string): Promise<Buffer> {
const command = new GetObjectCommand({ const command = new GetObjectCommand({
Bucket: process.env.RAG_UPLOADS_S3_BUCKET, Bucket: process.env.RAG_UPLOADS_S3_BUCKET,
Key: s3Key, Key: s3Key,
@ -54,33 +73,59 @@ async function retryable<T>(fn: () => Promise<T>, maxAttempts: number = 3): Prom
} }
} }
async function runProcessPipeline(_logger: PrefixLogger, job: WithId<z.infer<typeof DataSource>>, doc: WithId<z.infer<typeof DataSourceDoc>>): Promise<void> { async function runProcessPipeline(_logger: PrefixLogger, job: WithId<z.infer<typeof DataSource>>, doc: WithId<z.infer<typeof DataSourceDoc>> & { data: { type: "file_local" | "file_s3" } }): Promise<void> {
const logger = _logger const logger = _logger
.child(doc._id.toString()) .child(doc._id.toString())
.child(doc.name); .child(doc.name);
// Get file content from S3 // Get file content
logger.log("Fetching file from S3"); let fileData: Buffer;
if (doc.data.type !== 'file') { if (doc.data.type === 'file_local') {
throw new Error("Invalid data source type"); logger.log("Fetching file from local");
fileData = await getLocalFileContent(path.join(UPLOADS_DIR, doc._id.toString()));
} else {
logger.log("Fetching file from S3");
fileData = await getS3FileContent(doc.data.s3Key);
} }
const fileData = await getFileContent(doc.data.s3Key);
// Use Gemini to extract text content let markdown = "";
logger.log("Extracting content using Gemini"); const extractPrompt = "Extract and return only the text content from this document in markdown format. Exclude any formatting instructions or additional commentary.";
const model = genAI.getGenerativeModel({ model: "gemini-2.0-flash-001" }); if (!USE_GEMINI_FILE_PARSING) {
const prompt = "Extract and return only the text content from this document in markdown format. Exclude any formatting instructions or additional commentary."; // Use OpenAI to extract text content
logger.log("Extracting content using OpenAI");
const result = await model.generateContent([ const { text } = await generateText({
{ model: openai(FILE_PARSING_MODEL),
inlineData: { system: extractPrompt,
data: fileData.toString('base64'), messages: [
mimeType: doc.data.mimeType {
} role: "user",
}, content: [
prompt {
]); type: "file",
const markdown = result.response.text(); data: fileData.toString('base64'),
mimeType: doc.data.mimeType,
}
]
}
],
});
markdown = text;
} else {
// Use Gemini to extract text content
logger.log("Extracting content using Gemini");
const model = genAI.getGenerativeModel({ model: "gemini-2.0-flash-001" });
const result = await model.generateContent([
{
inlineData: {
data: fileData.toString('base64'),
mimeType: doc.data.mimeType
}
},
extractPrompt,
]);
markdown = result.response.text();
}
// split into chunks // split into chunks
logger.log("Splitting into chunks"); logger.log("Splitting into chunks");
@ -165,7 +210,6 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
// fetch next job from mongodb // fetch next job from mongodb
(async () => { (async () => {
while (true) { while (true) {
console.log("Polling for job...")
const now = Date.now(); const now = Date.now();
let job: WithId<z.infer<typeof DataSource>> | null = null; let job: WithId<z.infer<typeof DataSource>> | null = null;
@ -183,7 +227,7 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
job = await dataSourcesCollection.findOneAndUpdate( job = await dataSourcesCollection.findOneAndUpdate(
{ {
$and: [ $and: [
{ 'data.type': { $eq: "files" } }, { 'data.type': { $in: ["files_local", "files_s3"] } },
{ {
$or: [ $or: [
// if the job has never been attempted // if the job has never been attempted
@ -234,7 +278,7 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
let errors = false; let errors = false;
try { try {
if (job.data.type !== 'files') { if (job.data.type !== 'files_local' && job.data.type !== 'files_s3') {
throw new Error("Invalid data source type"); throw new Error("Invalid data source type");
} }
@ -276,8 +320,9 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
// for each doc // for each doc
for (const doc of pendingDocs) { for (const doc of pendingDocs) {
const ldoc = doc as WithId<z.infer<typeof DataSourceDoc>> & { data: { type: "file_local" | "file_s3" } };
try { try {
await runProcessPipeline(logger, job, doc); await runProcessPipeline(logger, job, ldoc);
} catch (e: any) { } catch (e: any) {
errors = true; errors = true;
logger.log("Error processing doc:", e); logger.log("Error processing doc:", e);

View file

@ -112,7 +112,6 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
// fetch next job from mongodb // fetch next job from mongodb
(async () => { (async () => {
while (true) { while (true) {
console.log("Polling for job...")
const now = Date.now(); const now = Date.now();
let job: WithId<z.infer<typeof DataSource>> | null = null; let job: WithId<z.infer<typeof DataSource>> | null = null;

View file

@ -143,7 +143,6 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
// fetch next job from mongodb // fetch next job from mongodb
(async () => { (async () => {
while (true) { while (true) {
console.log("Polling for job...")
const now = Date.now(); const now = Date.now();
let job: WithId<z.infer<typeof DataSource>> | null = null; let job: WithId<z.infer<typeof DataSource>> | null = null;

View file

@ -1,14 +1,18 @@
import '../lib/loadenv'; import '../lib/loadenv';
import { qdrantClient } from '../lib/qdrant'; import { qdrantClient } from '../lib/qdrant';
(async () => { const EMBEDDING_VECTOR_SIZE = Number(process.env.EMBEDDING_VECTOR_SIZE) || 1536;
await qdrantClient.createCollection('embeddings', {
vectors: {
size: 1536,
distance: 'Dot',
},
});
const { collections } = await qdrantClient.getCollections(); (async () => {
console.log(collections); try {
const result = await qdrantClient.createCollection('embeddings', {
vectors: {
size: EMBEDDING_VECTOR_SIZE,
distance: 'Dot',
},
});
console.log(`Create qdrant collection 'embeddings' completed with result: ${result}`);
} catch (error) {
console.error(`Unable to create qdrant collection 'embeddings': ${error}`);
}
})(); })();

File diff suppressed because it is too large Load diff

View file

@ -15,7 +15,7 @@
"ragTextWorker": "tsx app/scripts/rag_text_worker.ts" "ragTextWorker": "tsx app/scripts/rag_text_worker.ts"
}, },
"dependencies": { "dependencies": {
"@ai-sdk/openai": "^0.0.37", "@ai-sdk/openai": "^1.3.21",
"@auth0/nextjs-auth0": "^3.5.0", "@auth0/nextjs-auth0": "^3.5.0",
"@aws-sdk/client-s3": "^3.743.0", "@aws-sdk/client-s3": "^3.743.0",
"@aws-sdk/s3-request-presigner": "^3.743.0", "@aws-sdk/s3-request-presigner": "^3.743.0",
@ -31,7 +31,7 @@
"@modelcontextprotocol/sdk": "^1.7.0", "@modelcontextprotocol/sdk": "^1.7.0",
"@primer/react": "^36.27.0", "@primer/react": "^36.27.0",
"@qdrant/js-client-rest": "^1.13.0", "@qdrant/js-client-rest": "^1.13.0",
"ai": "^3.3.28", "ai": "^4.3.13",
"cheerio": "^1.0.0", "cheerio": "^1.0.0",
"class-variance-authority": "^0.7.1", "class-variance-authority": "^0.7.1",
"clsx": "^2.1.1", "clsx": "^2.1.1",

View file

@ -1,5 +1,13 @@
version: '3.8' version: '3.8'
volumes:
uploads:
driver: local
driver_opts:
type: none
o: bind
device: ./data/uploads
services: services:
rowboat: rowboat:
build: build:
@ -21,10 +29,11 @@ services:
- COPILOT_API_URL=http://copilot:3002 - COPILOT_API_URL=http://copilot:3002
- COPILOT_API_KEY=${COPILOT_API_KEY} - COPILOT_API_KEY=${COPILOT_API_KEY}
- REDIS_URL=redis://redis:6379 - REDIS_URL=redis://redis:6379
- USE_RAG=${USE_RAG} - USE_RAG=true
- QDRANT_URL=${QDRANT_URL} - QDRANT_URL=http://qdrant:6333
- QDRANT_API_KEY=${QDRANT_API_KEY} - QDRANT_API_KEY=${QDRANT_API_KEY}
- USE_RAG_UPLOADS=${USE_RAG_UPLOADS} - USE_RAG_UPLOADS=true
- USE_RAG_S3_UPLOADS=${USE_RAG_S3_UPLOADS}
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- RAG_UPLOADS_S3_BUCKET=${RAG_UPLOADS_S3_BUCKET} - RAG_UPLOADS_S3_BUCKET=${RAG_UPLOADS_S3_BUCKET}
@ -38,7 +47,10 @@ services:
- MAX_PROJECTS_PER_USER=${MAX_PROJECTS_PER_USER} - MAX_PROJECTS_PER_USER=${MAX_PROJECTS_PER_USER}
- VOICE_API_URL=${VOICE_API_URL} - VOICE_API_URL=${VOICE_API_URL}
- PROVIDER_DEFAULT_MODEL=${PROVIDER_DEFAULT_MODEL} - PROVIDER_DEFAULT_MODEL=${PROVIDER_DEFAULT_MODEL}
- RAG_UPLOADS_DIR=/app/uploads
restart: unless-stopped restart: unless-stopped
volumes:
- uploads:/app/uploads
rowboat_agents: rowboat_agents:
build: build:
@ -51,7 +63,7 @@ services:
- API_KEY=${AGENTS_API_KEY} - API_KEY=${AGENTS_API_KEY}
- REDIS_URL=redis://redis:6379 - REDIS_URL=redis://redis:6379
- MONGODB_URI=mongodb://mongo:27017/rowboat - MONGODB_URI=mongodb://mongo:27017/rowboat
- QDRANT_URL=${QDRANT_URL} - QDRANT_URL=http://qdrant:6333
- QDRANT_API_KEY=${QDRANT_API_KEY} - QDRANT_API_KEY=${QDRANT_API_KEY}
- PROVIDER_BASE_URL=${PROVIDER_BASE_URL} - PROVIDER_BASE_URL=${PROVIDER_BASE_URL}
- PROVIDER_API_KEY=${PROVIDER_API_KEY} - PROVIDER_API_KEY=${PROVIDER_API_KEY}
@ -99,21 +111,21 @@ services:
build: build:
context: ./apps/rowboat context: ./apps/rowboat
dockerfile: scripts.Dockerfile dockerfile: scripts.Dockerfile
command: ["sh", "-c", "npm run setupQdrant && echo 'index created successfully'"] command: ["sh", "-c", "npm run setupQdrant"]
profiles: [ "setup_qdrant" ]
environment: environment:
- QDRANT_URL=${QDRANT_URL} - QDRANT_URL=http://qdrant:6333
- QDRANT_API_KEY=${QDRANT_API_KEY} - QDRANT_API_KEY=${QDRANT_API_KEY}
- EMBEDDING_VECTOR_SIZE=${EMBEDDING_VECTOR_SIZE}
restart: no restart: no
delete_qdrant: delete_qdrant:
build: build:
context: ./apps/rowboat context: ./apps/rowboat
dockerfile: scripts.Dockerfile dockerfile: scripts.Dockerfile
command: ["sh", "-c", "npm run deleteQdrant && echo 'index deleted successfully'"] command: ["sh", "-c", "npm run deleteQdrant"]
profiles: [ "delete_qdrant" ] profiles: [ "delete_qdrant" ]
environment: environment:
- QDRANT_URL=${QDRANT_URL} - QDRANT_URL=http://qdrant:6333
- QDRANT_API_KEY=${QDRANT_API_KEY} - QDRANT_API_KEY=${QDRANT_API_KEY}
restart: no restart: no
@ -125,15 +137,23 @@ services:
profiles: [ "rag_files_worker" ] profiles: [ "rag_files_worker" ]
environment: environment:
- OPENAI_API_KEY=${OPENAI_API_KEY} - OPENAI_API_KEY=${OPENAI_API_KEY}
- EMBEDDING_PROVIDER_BASE_URL=${EMBEDDING_PROVIDER_BASE_URL}
- EMBEDDING_PROVIDER_API_KEY=${EMBEDDING_PROVIDER_API_KEY}
- EMBEDDING_MODEL=${EMBEDDING_MODEL}
- MONGODB_CONNECTION_STRING=mongodb://mongo:27017/rowboat - MONGODB_CONNECTION_STRING=mongodb://mongo:27017/rowboat
- REDIS_URL=redis://redis:6379
- GOOGLE_API_KEY=${GOOGLE_API_KEY} - GOOGLE_API_KEY=${GOOGLE_API_KEY}
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
- RAG_UPLOADS_S3_BUCKET=${RAG_UPLOADS_S3_BUCKET} - RAG_UPLOADS_S3_BUCKET=${RAG_UPLOADS_S3_BUCKET}
- RAG_UPLOADS_S3_REGION=${RAG_UPLOADS_S3_REGION} - RAG_UPLOADS_S3_REGION=${RAG_UPLOADS_S3_REGION}
- QDRANT_URL=${QDRANT_URL} - QDRANT_URL=http://qdrant:6333
- QDRANT_API_KEY=${QDRANT_API_KEY} - QDRANT_API_KEY=${QDRANT_API_KEY}
- RAG_UPLOADS_DIR=/app/uploads
- USE_GEMINI_FILE_PARSING=${USE_GEMINI_FILE_PARSING}
restart: unless-stopped restart: unless-stopped
volumes:
- uploads:/app/uploads
rag_urls_worker: rag_urls_worker:
build: build:
@ -143,9 +163,13 @@ services:
profiles: [ "rag_urls_worker" ] profiles: [ "rag_urls_worker" ]
environment: environment:
- OPENAI_API_KEY=${OPENAI_API_KEY} - OPENAI_API_KEY=${OPENAI_API_KEY}
- EMBEDDING_PROVIDER_BASE_URL=${EMBEDDING_PROVIDER_BASE_URL}
- EMBEDDING_PROVIDER_API_KEY=${EMBEDDING_PROVIDER_API_KEY}
- EMBEDDING_MODEL=${EMBEDDING_MODEL}
- MONGODB_CONNECTION_STRING=mongodb://mongo:27017/rowboat - MONGODB_CONNECTION_STRING=mongodb://mongo:27017/rowboat
- REDIS_URL=redis://redis:6379
- FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY} - FIRECRAWL_API_KEY=${FIRECRAWL_API_KEY}
- QDRANT_URL=${QDRANT_URL} - QDRANT_URL=http://qdrant:6333
- QDRANT_API_KEY=${QDRANT_API_KEY} - QDRANT_API_KEY=${QDRANT_API_KEY}
restart: unless-stopped restart: unless-stopped
@ -157,8 +181,12 @@ services:
profiles: [ "rag_text_worker" ] profiles: [ "rag_text_worker" ]
environment: environment:
- OPENAI_API_KEY=${OPENAI_API_KEY} - OPENAI_API_KEY=${OPENAI_API_KEY}
- EMBEDDING_PROVIDER_BASE_URL=${EMBEDDING_PROVIDER_BASE_URL}
- EMBEDDING_PROVIDER_API_KEY=${EMBEDDING_PROVIDER_API_KEY}
- EMBEDDING_MODEL=${EMBEDDING_MODEL}
- MONGODB_CONNECTION_STRING=mongodb://mongo:27017/rowboat - MONGODB_CONNECTION_STRING=mongodb://mongo:27017/rowboat
- QDRANT_URL=${QDRANT_URL} - REDIS_URL=redis://redis:6379
- QDRANT_URL=http://qdrant:6333
- QDRANT_API_KEY=${QDRANT_API_KEY} - QDRANT_API_KEY=${QDRANT_API_KEY}
restart: unless-stopped restart: unless-stopped
@ -209,3 +237,13 @@ services:
# - ROWBOAT_API_HOST=http://rowboat:3000 # - ROWBOAT_API_HOST=http://rowboat:3000
# - MONGODB_URI=mongodb://mongo:27017/rowboat # - MONGODB_URI=mongodb://mongo:27017/rowboat
# restart: unless-stopped # restart: unless-stopped
qdrant:
image: qdrant/qdrant
ports:
- "6333:6333"
environment:
- QDRANT__STORAGE__STORAGE_PATH=/data/qdrant
restart: unless-stopped
volumes:
- ./data/qdrant:/data/qdrant

29
start.sh Executable file
View file

@ -0,0 +1,29 @@
#!/bin/bash
# Start the Rowboat stack via docker-compose, enabling optional service
# profiles based on environment flags, after creating the local data
# directories that docker-compose bind-mounts.
#
# Flags:
#   USE_RAG_SCRAPING=true   also start the rag_urls_worker profile
set -u

# Ensure bind-mount data dirs exist (referenced by docker-compose.yaml).
mkdir -p data/uploads data/qdrant data/mongo

# Build the command as an array so arguments are passed intact even if
# one ever contains whitespace (an unquoted $CMD string would word-split).
CMD=(docker-compose)

# RAG text and files workers are always enabled.
CMD+=(--profile rag_text_worker)
CMD+=(--profile rag_files_worker)

# URL scraping worker is opt-in.
if [ "${USE_RAG_SCRAPING:-}" = "true" ]; then
    CMD+=(--profile rag_urls_worker)
fi

# Add more flag-to-profile mappings as needed:
# if [ "${SOME_OTHER_ENV:-}" = "true" ]; then
#     CMD+=(--profile some_other_profile)
# fi

# Bring the stack up, rebuilding images.
CMD+=(up --build)

echo "Running: ${CMD[*]}"
exec "${CMD[@]}"