DDD refactor: data-sources (#205)

This commit is contained in:
Ramnique Singh 2025-08-17 08:06:17 +05:30 committed by GitHub
parent 912c8be156
commit 4b33b20e76
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
68 changed files with 2589 additions and 1588 deletions

View file

@ -5,13 +5,12 @@ import {
} from "../lib/types/copilot_types";
import {
Workflow} from "../lib/types/workflow_types";
import { DataSource } from "../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { z } from 'zod';
import { projectAuthCheck } from "./project.actions";
import { redisClient } from "../lib/redis";
import { authorizeUserAction, logUsage } from "./billing.actions";
import { USE_BILLING } from "../lib/feature_flags";
import { WithStringId } from "../lib/types/types";
import { getEditAgentInstructionsResponse } from "../lib/copilot/copilot";
import { container } from "@/di/container";
import { IUsageQuotaPolicy } from "@/src/application/policies/usage-quota.policy.interface";
@ -24,7 +23,7 @@ export async function getCopilotResponseStream(
messages: z.infer<typeof CopilotMessage>[],
current_workflow_config: z.infer<typeof Workflow>,
context: z.infer<typeof CopilotChatContext> | null,
dataSources?: WithStringId<z.infer<typeof DataSource>>[]
dataSources?: z.infer<typeof DataSource>[]
): Promise<{
streamId: string;
} | { billingError: string }> {

View file

@ -1,41 +1,53 @@
'use server';
import { ObjectId, WithId } from "mongodb";
import { dataSourcesCollection, dataSourceDocsCollection } from "../lib/mongodb";
import { z } from 'zod';
import { GetObjectCommand, PutObjectCommand } from "@aws-sdk/client-s3";
import { getSignedUrl } from "@aws-sdk/s3-request-presigner";
import { projectAuthCheck } from "./project.actions";
import { WithStringId } from "../lib/types/types";
import { DataSourceDoc } from "../lib/types/datasource_types";
import { DataSource } from "../lib/types/datasource_types";
import { uploadsS3Client } from "../lib/uploads_s3_client";
import { DataSourceDoc } from "@/src/entities/models/data-source-doc";
import { DataSource } from "@/src/entities/models/data-source";
import { container } from "@/di/container";
import { IFetchDataSourceController } from "@/src/interface-adapters/controllers/data-sources/fetch-data-source.controller";
import { authCheck } from "./auth.actions";
import { IListDataSourcesController } from "@/src/interface-adapters/controllers/data-sources/list-data-sources.controller";
import { ICreateDataSourceController } from "@/src/interface-adapters/controllers/data-sources/create-data-source.controller";
import { IRecrawlWebDataSourceController } from "@/src/interface-adapters/controllers/data-sources/recrawl-web-data-source.controller";
import { IDeleteDataSourceController } from "@/src/interface-adapters/controllers/data-sources/delete-data-source.controller";
import { IToggleDataSourceController } from "@/src/interface-adapters/controllers/data-sources/toggle-data-source.controller";
import { IAddDocsToDataSourceController } from "@/src/interface-adapters/controllers/data-sources/add-docs-to-data-source.controller";
import { IListDocsInDataSourceController } from "@/src/interface-adapters/controllers/data-sources/list-docs-in-data-source.controller";
import { IDeleteDocFromDataSourceController } from "@/src/interface-adapters/controllers/data-sources/delete-doc-from-data-source.controller";
import { IGetDownloadUrlForFileController } from "@/src/interface-adapters/controllers/data-sources/get-download-url-for-file.controller";
import { IGetUploadUrlsForFilesController } from "@/src/interface-adapters/controllers/data-sources/get-upload-urls-for-files.controller";
import { IUpdateDataSourceController } from "@/src/interface-adapters/controllers/data-sources/update-data-source.controller";
export async function getDataSource(projectId: string, sourceId: string): Promise<WithStringId<z.infer<typeof DataSource>>> {
await projectAuthCheck(projectId);
const source = await dataSourcesCollection.findOne({
_id: new ObjectId(sourceId),
projectId,
// Data-source controllers, each resolved from the DI container once when this
// module is evaluated. The string passed to `container.resolve` is the
// registration key; the generic argument is the controller interface the
// resolved instance is typed as.
const fetchDataSourceController = container.resolve<IFetchDataSourceController>("fetchDataSourceController");
const listDataSourcesController = container.resolve<IListDataSourcesController>("listDataSourcesController");
const createDataSourceController = container.resolve<ICreateDataSourceController>("createDataSourceController");
const recrawlWebDataSourceController = container.resolve<IRecrawlWebDataSourceController>("recrawlWebDataSourceController");
const deleteDataSourceController = container.resolve<IDeleteDataSourceController>("deleteDataSourceController");
const toggleDataSourceController = container.resolve<IToggleDataSourceController>("toggleDataSourceController");
// Controllers operating on the documents that belong to a data source.
const addDocsToDataSourceController = container.resolve<IAddDocsToDataSourceController>("addDocsToDataSourceController");
const listDocsInDataSourceController = container.resolve<IListDocsInDataSourceController>("listDocsInDataSourceController");
const deleteDocFromDataSourceController = container.resolve<IDeleteDocFromDataSourceController>("deleteDocFromDataSourceController");
// Controllers that hand out download / upload URLs for file documents.
const getDownloadUrlForFileController = container.resolve<IGetDownloadUrlForFileController>("getDownloadUrlForFileController");
const getUploadUrlsForFilesController = container.resolve<IGetUploadUrlsForFilesController>("getUploadUrlsForFilesController");
const updateDataSourceController = container.resolve<IUpdateDataSourceController>("updateDataSourceController");
export async function getDataSource(sourceId: string): Promise<z.infer<typeof DataSource>> {
const user = await authCheck();
return await fetchDataSourceController.execute({
caller: 'user',
userId: user._id,
sourceId,
});
if (!source) {
throw new Error('Invalid data source');
}
const { _id, ...rest } = source;
return {
...rest,
_id: _id.toString(),
};
}
export async function listDataSources(projectId: string): Promise<WithStringId<z.infer<typeof DataSource>>[]> {
await projectAuthCheck(projectId);
const sources = await dataSourcesCollection.find({
projectId: projectId,
status: { $ne: 'deleted' },
}).toArray();
return sources.map((s) => ({
...s,
_id: s._id.toString(),
}));
/**
 * Lists the data sources of a project on behalf of the current user.
 *
 * Awaits `authCheck()` to obtain the calling user, then delegates to
 * `listDataSourcesController`, identifying the caller as `'user'` together
 * with that user's `_id`.
 *
 * @param projectId - Project whose data sources are requested.
 * @returns The value the controller resolves to, typed as `DataSource[]`.
 */
export async function listDataSources(projectId: string): Promise<z.infer<typeof DataSource>[]> {
    const { _id: userId } = await authCheck();
    const sources = await listDataSourcesController.execute({
        caller: 'user',
        userId,
        projectId,
    });
    return sources;
}
export async function createDataSource({
@ -50,272 +62,124 @@ export async function createDataSource({
description?: string,
data: z.infer<typeof DataSource>['data'],
status?: 'pending' | 'ready',
}): Promise<WithStringId<z.infer<typeof DataSource>>> {
await projectAuthCheck(projectId);
const source: z.infer<typeof DataSource> = {
projectId: projectId,
active: true,
name: name,
description,
createdAt: (new Date()).toISOString(),
attempts: 0,
version: 1,
data,
};
// Only set status for non-file data sources
if (data.type !== 'files_local' && data.type !== 'files_s3') {
source.status = status;
}
await dataSourcesCollection.insertOne(source);
const { _id, ...rest } = source as WithId<z.infer<typeof DataSource>>;
return {
...rest,
_id: _id.toString(),
};
}
export async function recrawlWebDataSource(projectId: string, sourceId: string) {
await projectAuthCheck(projectId);
const source = await getDataSource(projectId, sourceId);
if (source.data.type !== 'urls') {
throw new Error('Invalid data source type');
}
// mark all files as queued
await dataSourceDocsCollection.updateMany({
sourceId: sourceId,
}, {
$set: {
status: 'pending',
lastUpdatedAt: (new Date()).toISOString(),
attempts: 0,
}
});
// mark data source as pending
await dataSourcesCollection.updateOne({
_id: new ObjectId(sourceId),
}, {
$set: {
status: 'pending',
billingError: undefined,
lastUpdatedAt: (new Date()).toISOString(),
attempts: 0,
},
$inc: {
version: 1,
}): Promise<z.infer<typeof DataSource>> {
const user = await authCheck();
return await createDataSourceController.execute({
caller: 'user',
userId: user._id,
data: {
projectId,
name,
description: description || '',
status,
data,
},
});
}
export async function deleteDataSource(projectId: string, sourceId: string) {
await projectAuthCheck(projectId);
await getDataSource(projectId, sourceId);
export async function recrawlWebDataSource(sourceId: string) {
const user = await authCheck();
// mark data source as deleted
await dataSourcesCollection.updateOne({
_id: new ObjectId(sourceId),
}, {
$set: {
status: 'deleted',
billingError: undefined,
lastUpdatedAt: (new Date()).toISOString(),
attempts: 0,
},
$inc: {
version: 1,
},
return await recrawlWebDataSourceController.execute({
caller: 'user',
userId: user._id,
sourceId,
});
}
export async function toggleDataSource(projectId: string, sourceId: string, active: boolean) {
await projectAuthCheck(projectId);
await getDataSource(projectId, sourceId);
export async function deleteDataSource(sourceId: string) {
const user = await authCheck();
await dataSourcesCollection.updateOne({
"_id": new ObjectId(sourceId),
"projectId": projectId,
}, {
$set: {
"active": active,
}
return await deleteDataSourceController.execute({
caller: 'user',
userId: user._id,
sourceId,
});
}
/**
 * Sets the `active` flag of a data source on behalf of the current user.
 *
 * Awaits `authCheck()` to obtain the calling user, then delegates to
 * `toggleDataSourceController`, identifying the caller as `'user'` together
 * with that user's `_id`.
 *
 * @param sourceId - Identifier of the data source to update.
 * @param active - Value forwarded to the controller as the new active state.
 * @returns The value the controller resolves to.
 */
export async function toggleDataSource(sourceId: string, active: boolean) {
    const { _id: userId } = await authCheck();
    const result = await toggleDataSourceController.execute({
        caller: 'user',
        userId,
        sourceId,
        active,
    });
    return result;
}
export async function addDocsToDataSource({
projectId,
sourceId,
docData,
}: {
projectId: string,
sourceId: string,
docData: {
_id?: string,
name: string,
data: z.infer<typeof DataSourceDoc>['data']
}[]
}): Promise<void> {
await projectAuthCheck(projectId);
const source = await getDataSource(projectId, sourceId);
const user = await authCheck();
await dataSourceDocsCollection.insertMany(docData.map(doc => {
const record: z.infer<typeof DataSourceDoc> = {
sourceId,
name: doc.name,
status: 'pending',
createdAt: new Date().toISOString(),
data: doc.data,
version: 1,
};
if (!doc._id) {
return record;
}
const recordWithId = record as WithId<z.infer<typeof DataSourceDoc>>;
recordWithId._id = new ObjectId(doc._id);
return recordWithId;
}));
// Only set status to pending when files are added
if (docData.length > 0 && (source.data.type === 'files_local' || source.data.type === 'files_s3')) {
await dataSourcesCollection.updateOne(
{ _id: new ObjectId(sourceId) },
{
$set: {
status: 'pending',
billingError: undefined,
attempts: 0,
lastUpdatedAt: new Date().toISOString(),
},
$inc: {
version: 1,
},
}
);
}
return await addDocsToDataSourceController.execute({
caller: 'user',
userId: user._id,
sourceId,
docs: docData,
});
}
export async function listDocsInDataSource({
projectId,
sourceId,
page = 1,
limit = 10,
}: {
projectId: string,
sourceId: string,
page?: number,
limit?: number,
}): Promise<{
files: WithStringId<z.infer<typeof DataSourceDoc>>[],
files: z.infer<typeof DataSourceDoc>[],
total: number
}> {
await projectAuthCheck(projectId);
await getDataSource(projectId, sourceId);
const user = await authCheck();
// Get total count
const total = await dataSourceDocsCollection.countDocuments({
const docs = await listDocsInDataSourceController.execute({
caller: 'user',
userId: user._id,
sourceId,
status: { $ne: 'deleted' },
});
// Fetch docs with pagination
const docs = await dataSourceDocsCollection.find({
sourceId,
status: { $ne: 'deleted' },
})
.skip((page - 1) * limit)
.limit(limit)
.toArray();
return {
files: docs.map(f => ({ ...f, _id: f._id.toString() })),
total
files: docs,
total: docs.length,
};
}
export async function deleteDocsFromDataSource({
projectId,
sourceId,
docIds,
export async function deleteDocFromDataSource({
docId,
}: {
projectId: string,
sourceId: string,
docIds: string[],
docId: string,
}): Promise<void> {
await projectAuthCheck(projectId);
await getDataSource(projectId, sourceId);
// mark for deletion
await dataSourceDocsCollection.updateMany(
{
sourceId,
_id: {
$in: docIds.map(id => new ObjectId(id))
}
},
{
$set: {
status: "deleted",
lastUpdatedAt: new Date().toISOString(),
},
$inc: {
version: 1,
},
}
);
// mark data source as pending
await dataSourcesCollection.updateOne({
_id: new ObjectId(sourceId),
}, {
$set: {
status: 'pending',
billingError: undefined,
attempts: 0,
lastUpdatedAt: new Date().toISOString(),
},
$inc: {
version: 1,
},
const user = await authCheck();
return await deleteDocFromDataSourceController.execute({
caller: 'user',
userId: user._id,
docId,
});
}
export async function getDownloadUrlForFile(
projectId: string,
sourceId: string,
fileId: string
): Promise<string> {
await projectAuthCheck(projectId);
await getDataSource(projectId, sourceId);
const file = await dataSourceDocsCollection.findOne({
sourceId,
_id: new ObjectId(fileId),
'data.type': { $in: ['file_local', 'file_s3'] },
const user = await authCheck();
return await getDownloadUrlForFileController.execute({
caller: 'user',
userId: user._id,
fileId,
});
if (!file) {
throw new Error('File not found');
}
// if local, return path
if (file.data.type === 'file_local') {
return `/api/uploads/${fileId}`;
} else if (file.data.type === 'file_s3') {
const command = new GetObjectCommand({
Bucket: process.env.RAG_UPLOADS_S3_BUCKET,
Key: file.data.s3Key,
});
return await getSignedUrl(uploadsS3Client, command, { expiresIn: 60 }); // URL valid for 1 minute
}
throw new Error('Invalid file type');
}
export async function getUploadUrlsForFilesDataSource(
projectId: string,
sourceId: string,
files: { name: string; type: string; size: number }[]
): Promise<{
@ -323,70 +187,31 @@ export async function getUploadUrlsForFilesDataSource(
uploadUrl: string,
path: string,
}[]> {
await projectAuthCheck(projectId);
const source = await getDataSource(projectId, sourceId);
if (source.data.type !== 'files_local' && source.data.type !== 'files_s3') {
throw new Error('Invalid files data source');
}
const user = await authCheck();
const urls: {
fileId: string,
uploadUrl: string,
path: string,
}[] = [];
for (const file of files) {
const fileId = new ObjectId().toString();
if (source.data.type === 'files_s3') {
// Generate presigned URL
const projectIdPrefix = projectId.slice(0, 2); // 2 characters from the start of the projectId
const path = `datasources/files/${projectIdPrefix}/${projectId}/${sourceId}/${fileId}/${file.name}`;
const command = new PutObjectCommand({
Bucket: process.env.RAG_UPLOADS_S3_BUCKET,
Key: path,
ContentType: file.type,
});
const uploadUrl = await getSignedUrl(uploadsS3Client, command, { expiresIn: 10 * 60 }); // valid for 10 minutes
urls.push({
fileId,
uploadUrl,
path,
});
} else if (source.data.type === 'files_local') {
// Generate local upload URL
urls.push({
fileId,
uploadUrl: '/api/uploads/' + fileId,
path: '/api/uploads/' + fileId,
});
}
}
return urls;
return await getUploadUrlsForFilesController.execute({
caller: 'user',
userId: user._id,
sourceId,
files,
});
}
export async function updateDataSource({
projectId,
sourceId,
description,
}: {
projectId: string,
sourceId: string,
description: string,
}) {
await projectAuthCheck(projectId);
await getDataSource(projectId, sourceId);
const user = await authCheck();
await dataSourcesCollection.updateOne({
_id: new ObjectId(sourceId),
}, {
$set: {
return await updateDataSourceController.execute({
caller: 'user',
userId: user._id,
sourceId,
data: {
description,
lastUpdatedAt: (new Date()).toISOString(),
},
$inc: {
version: 1,
},
});
}

View file

@ -1,6 +1,6 @@
'use server';
import { redirect } from "next/navigation";
import { db, dataSourcesCollection, projectsCollection } from "../lib/mongodb";
import { db, projectsCollection } from "../lib/mongodb";
import { z } from 'zod';
import crypto from 'crypto';
import { revalidatePath } from "next/cache";
@ -12,13 +12,16 @@ import { Project } from "../lib/types/project_types";
import { USE_AUTH } from "../lib/feature_flags";
import { authorizeUserAction } from "./billing.actions";
import { Workflow } from "../lib/types/workflow_types";
import { container } from "@/di/container";
import { IProjectActionAuthorizationPolicy } from "@/src/application/policies/project-action-authorization.policy";
import { ICreateApiKeyController } from "@/src/interface-adapters/controllers/api-keys/create-api-key.controller";
import { IListApiKeysController } from "@/src/interface-adapters/controllers/api-keys/list-api-keys.controller";
import { IDeleteApiKeyController } from "@/src/interface-adapters/controllers/api-keys/delete-api-key.controller";
import { IApiKeysRepository } from "@/src/application/repositories/api-keys.repository.interface";
import { IProjectMembersRepository } from "@/src/application/repositories/project-members.repository.interface";
import { IDataSourcesRepository } from "@/src/application/repositories/data-sources.repository.interface";
import { IDataSourceDocsRepository } from "@/src/application/repositories/data-source-docs.repository.interface";
import { container } from "@/di/container";
import { qdrantClient } from "../lib/qdrant";
const projectActionAuthorizationPolicy = container.resolve<IProjectActionAuthorizationPolicy>('projectActionAuthorizationPolicy');
const createApiKeyController = container.resolve<ICreateApiKeyController>('createApiKeyController');
@ -26,6 +29,8 @@ const listApiKeysController = container.resolve<IListApiKeysController>('listApi
const deleteApiKeyController = container.resolve<IDeleteApiKeyController>('deleteApiKeyController');
const apiKeysRepository = container.resolve<IApiKeysRepository>('apiKeysRepository');
const projectMembersRepository = container.resolve<IProjectMembersRepository>('projectMembersRepository');
const dataSourcesRepository = container.resolve<IDataSourcesRepository>('dataSourcesRepository');
const dataSourceDocsRepository = container.resolve<IDataSourceDocsRepository>('dataSourceDocsRepository');
export async function listTemplates() {
const templatesArray = Object.entries(templates)
@ -234,22 +239,15 @@ export async function deleteProject(projectId: string) {
// delete api keys
await apiKeysRepository.deleteAll(projectId);
// delete embeddings
const sources = await dataSourcesCollection.find({
projectId,
}, {
projection: {
_id: true,
}
}).toArray();
const ids = sources.map(s => s._id);
// delete data sources
await dataSourcesCollection.deleteMany({
_id: {
$in: ids,
}
// delete data sources data
await dataSourceDocsRepository.deleteByProjectId(projectId);
await dataSourcesRepository.deleteByProjectId(projectId);
await qdrantClient.delete("embeddings", {
filter: {
must: [
{ key: "projectId", match: { value: projectId } },
],
},
});
// delete project members

View file

@ -2,11 +2,13 @@ import { NextRequest, NextResponse } from 'next/server';
import path from 'path';
import fs from 'fs/promises';
import fsSync from 'fs';
import { dataSourceDocsCollection } from '@/app/lib/mongodb';
import { ObjectId } from 'mongodb';
import { container } from '@/di/container';
import { IDataSourceDocsRepository } from '@/src/application/repositories/data-source-docs.repository.interface';
const UPLOADS_DIR = process.env.RAG_UPLOADS_DIR || '/uploads';
const dataSourceDocsRepository = container.resolve<IDataSourceDocsRepository>('dataSourceDocsRepository');
// PUT endpoint to handle file uploads
export async function PUT(request: NextRequest, props: { params: Promise<{ fileId: string }> }) {
const params = await props.params;
@ -39,10 +41,8 @@ export async function GET(request: NextRequest, props: { params: Promise<{ fileI
return NextResponse.json({ error: 'Missing file ID' }, { status: 400 });
}
const filePath = path.join(UPLOADS_DIR, fileId);
// get mimetype from database
const doc = await dataSourceDocsCollection.findOne({ _id: new ObjectId(fileId) });
const doc = await dataSourceDocsRepository.fetch(fileId);
if (!doc) {
return NextResponse.json({ error: 'File not found' }, { status: 404 });
}
@ -54,6 +54,9 @@ export async function GET(request: NextRequest, props: { params: Promise<{ fileI
const fileName = doc.data.name;
try {
// strip uploads dir from path
const filePath = path.join(UPLOADS_DIR, doc.data.path.split('/api/uploads/')[1]);
// Check if file exists
await fs.access(filePath);
// Create a readable stream

View file

@ -2,7 +2,6 @@
import { tool, Tool } from "@openai/agents";
import { createOpenAI } from "@ai-sdk/openai";
import { embed, generateText } from "ai";
import { ObjectId } from "mongodb";
import { z } from "zod";
import { composio } from "./composio/composio";
import { SignJWT } from "jose";
@ -11,12 +10,16 @@ import crypto from "crypto";
// Internal dependencies
import { embeddingModel } from '../lib/embedding';
import { getMcpClient } from "./mcp";
import { dataSourceDocsCollection, dataSourcesCollection, projectsCollection } from "./mongodb";
import { projectsCollection } from "./mongodb";
import { qdrantClient } from '../lib/qdrant';
import { EmbeddingRecord } from "./types/datasource_types";
import { WorkflowAgent, WorkflowTool } from "./types/workflow_types";
import { PrefixLogger } from "./utils";
import { UsageTracker } from "./billing";
import { DataSource } from "@/src/entities/models/data-source";
import { IDataSourcesRepository } from "@/src/application/repositories/data-sources.repository.interface";
import { IDataSourceDocsRepository } from "@/src/application/repositories/data-source-docs.repository.interface";
import { container } from "@/di/container";
// Provider configuration
const PROVIDER_API_KEY = process.env.PROVIDER_API_KEY || process.env.OPENAI_API_KEY || '';
@ -92,6 +95,9 @@ export async function invokeRagTool(
logger.log(`returnType: ${returnType}`);
logger.log(`k: ${k}`);
const dataSourcesRepository = container.resolve<IDataSourcesRepository>('dataSourcesRepository');
const dataSourceDocsRepository = container.resolve<IDataSourceDocsRepository>('dataSourceDocsRepository');
// Create embedding for question
const { embedding, usage } = await embed({
model: embeddingModel,
@ -109,14 +115,19 @@ export async function invokeRagTool(
});
// Fetch all data sources for this project
const sources = await dataSourcesCollection.find({
projectId: projectId,
active: true,
}).toArray();
const sources: z.infer<typeof DataSource>[] = [];
let cursor = undefined;
do {
const resp = await dataSourcesRepository.list(projectId, {
active: true,
}, cursor);
sources.push(...resp.items);
cursor = resp.nextCursor;
} while(cursor);
const validSourceIds = sources
.filter(s => sourceIds.includes(s._id.toString())) // id should be in sourceIds
.filter(s => s.active) // should be active
.map(s => s._id.toString());
.filter(s => sourceIds.includes(s.id)) // id should be in sourceIds
.map(s => s.id);
logger.log(`valid source ids: ${validSourceIds.join(', ')}`);
// if no sources found, return empty response
@ -157,14 +168,12 @@ export async function invokeRagTool(
}
// otherwise, fetch the doc contents from mongodb
const docs = await dataSourceDocsCollection.find({
_id: { $in: results.map(r => new ObjectId(r.docId)) },
}).toArray();
const docs = await dataSourceDocsRepository.bulkFetch(results.map(r => r.docId));
logger.log(`fetched docs: ${docs.length}`);
// map the results to the docs
results = results.map(r => {
const doc = docs.find(d => d._id.toString() === r.docId);
const doc = docs.find(d => d.id === r.docId);
return {
...r,
content: doc?.content || '',

View file

@ -1,10 +1,9 @@
import z from "zod";
import { createOpenAI } from "@ai-sdk/openai";
import { generateObject, streamText, tool } from "ai";
import { WithStringId } from "../types/types";
import { Workflow, WorkflowTool } from "../types/workflow_types";
import { CopilotChatContext, CopilotMessage } from "../types/copilot_types";
import { DataSource } from "../types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { PrefixLogger } from "../utils";
import zodToJsonSchema from "zod-to-json-schema";
import { COPILOT_INSTRUCTIONS_EDIT_AGENT } from "./copilot_edit_agent";
@ -102,11 +101,11 @@ ${JSON.stringify(workflow)}
`;
}
function getDataSourcesPrompt(dataSources: WithStringId<z.infer<typeof DataSource>>[]): string {
function getDataSourcesPrompt(dataSources: z.infer<typeof DataSource>[]): string {
let prompt = '';
if (dataSources.length > 0) {
const simplifiedDataSources = dataSources.map(ds => ({
id: ds._id,
id: ds.id,
name: ds.name,
description: ds.description,
data: ds.data,
@ -274,7 +273,7 @@ export async function* streamMultiAgentResponse(
context: z.infer<typeof CopilotChatContext> | null,
messages: z.infer<typeof CopilotMessage>[],
workflow: z.infer<typeof Workflow>,
dataSources: WithStringId<z.infer<typeof DataSource>>[]
dataSources: z.infer<typeof DataSource>[]
): AsyncIterable<z.infer<typeof ZEvent>> {
const logger = new PrefixLogger('copilot /stream');
logger.log('context', context);

View file

@ -2,8 +2,6 @@ import { MongoClient } from "mongodb";
import { User } from "./types/types";
import { Workflow } from "./types/workflow_types";
import { Project } from "./types/project_types";
import { DataSourceDoc } from "./types/datasource_types";
import { DataSource } from "./types/datasource_types";
import { TwilioConfig, TwilioInboundCall } from "./types/voice_types";
import { z } from 'zod';
import { apiV1 } from "rowboat-shared";
@ -11,8 +9,6 @@ import { apiV1 } from "rowboat-shared";
const client = new MongoClient(process.env["MONGODB_CONNECTION_STRING"] || "mongodb://localhost:27017");
export const db = client.db("rowboat");
export const dataSourcesCollection = db.collection<z.infer<typeof DataSource>>("sources");
export const dataSourceDocsCollection = db.collection<z.infer<typeof DataSourceDoc>>("source_docs");
export const projectsCollection = db.collection<z.infer<typeof Project>>("projects");
export const agentWorkflowsCollection = db.collection<z.infer<typeof Workflow>>("agent_workflows");
export const chatsCollection = db.collection<z.infer<typeof apiV1.Chat>>("chats");

View file

@ -1,7 +1,7 @@
import { z } from "zod";
import { Workflow } from "./workflow_types";
import { Message } from "./types";
import { DataSource } from "./datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
export const CopilotUserMessage = z.object({
role: z.literal('user'),
@ -52,9 +52,7 @@ export const CopilotAPIRequest = z.object({
messages: z.array(CopilotMessage),
workflow: Workflow,
context: CopilotChatContext.nullable(),
dataSources: z.array(DataSource.extend({
_id: z.string(),
})).optional(),
dataSources: z.array(DataSource).optional(),
});
export const CopilotAPIResponse = z.union([
z.object({

View file

@ -1,90 +1,5 @@
import { z } from "zod";
/**
 * Zod schema describing a data source record.
 *
 * A data source belongs to one project (`projectId`), has a display `name`
 * and an optional `description`, and carries processing bookkeeping
 * (`status`, `version`, `attempts`, timestamps and error strings). The kind
 * of source is given by the `data.type` discriminator.
 */
export const DataSource = z.object({
name: z.string(),
description: z.string().optional(),
projectId: z.string(),
// Falls back to `true` when the field is omitted from the parsed input.
active: z.boolean().default(true),
// Optional; when present it must be one of the four literals below.
status: z.union([
z.literal('pending'),
z.literal('ready'),
z.literal('error'),
z.literal('deleted'),
]).optional(),
version: z.number(),
error: z.string().optional(),
billingError: z.string().optional(),
// Timestamps are stored as strings validated by zod's `.datetime()` check.
createdAt: z.string().datetime(),
lastUpdatedAt: z.string().datetime().optional(),
attempts: z.number(),
lastAttemptAt: z.string().datetime().optional(),
// Declared with both `.default(false)` and `.optional()`.
// NOTE(review): confirm whether an omitted value parses to `false` or stays `undefined`.
pendingRefresh: z.boolean().default(false).optional(),
// Discriminated on `type`; none of the four variants declares any field
// other than the discriminator itself.
data: z.discriminatedUnion('type', [
z.object({
type: z.literal('urls'),
}),
z.object({
type: z.literal('files_local'),
}),
z.object({
type: z.literal('files_s3'),
}),
z.object({
type: z.literal('text'),
})
]),
});
/**
 * Zod schema describing a single document inside a data source.
 *
 * Each document references its parent through `sourceId`, carries a required
 * `status`, a `version` number and timestamps, and stores type-specific
 * details in `data`, discriminated on `data.type`.
 */
export const DataSourceDoc = z.object({
sourceId: z.string(),
name: z.string(),
version: z.number(),
// Required here (unlike `DataSource.status`, which is optional).
status: z.union([
z.literal('pending'),
z.literal('ready'),
z.literal('error'),
z.literal('deleted'),
]),
content: z.string().optional(),
// Timestamps are stored as strings validated by zod's `.datetime()` check.
createdAt: z.string().datetime(),
lastUpdatedAt: z.string().datetime().optional(),
error: z.string().optional(),
// Per-type payload: `url` carries a URL; `file_local` and `file_s3` carry
// file name, size and MIME type (`file_s3` additionally an `s3Key`);
// `text` carries the text content inline.
data: z.discriminatedUnion('type', [
z.object({
type: z.literal('url'),
url: z.string(),
}),
z.object({
type: z.literal('file_local'),
name: z.string(),
size: z.number(),
mimeType: z.string(),
}),
z.object({
type: z.literal('file_s3'),
name: z.string(),
size: z.number(),
mimeType: z.string(),
s3Key: z.string(),
}),
z.object({
type: z.literal('text'),
content: z.string(),
}),
]),
});
/**
 * Zod schema for an embedded piece of content: the text (`content`), the id
 * of the data source it came from (`sourceId`), its embedding as an array of
 * numbers, and descriptive `metadata`.
 */
export const EmbeddingDoc = z.object({
content: z.string(),
sourceId: z.string(),
embeddings: z.array(z.number()),
// `sourceURL` and `title` are required; `score` may be absent.
metadata: z.object({
sourceURL: z.string(),
title: z.string(),
score: z.number().optional(),
}),
});
export const EmbeddingRecord = z.object({
id: z.string().uuid(),
vector: z.array(z.number()),

View file

@ -5,7 +5,7 @@ import { useRef, useState, createContext, useContext, useCallback, forwardRef, u
import { CopilotChatContext } from "../../../lib/types/copilot_types";
import { CopilotMessage } from "../../../lib/types/copilot_types";
import { Workflow } from "@/app/lib/types/workflow_types";
import { DataSource } from "@/app/lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { z } from "zod";
import { Action as WorkflowDispatch } from "@/app/projects/[projectId]/workflow/workflow_editor";
import { Panel } from "@/components/common/panel-common";
@ -14,7 +14,6 @@ import { Messages } from "./components/messages";
import { CopyIcon, CheckIcon, PlusIcon, XIcon, InfoIcon, Sparkles } from "lucide-react";
import { useCopilot } from "./use-copilot";
import { BillingUpgradeModal } from "@/components/common/billing-upgrade-modal";
import { WithStringId } from "@/app/lib/types/types";
const CopilotContext = createContext<{
workflow: z.infer<typeof Workflow> | null;
@ -33,7 +32,7 @@ interface AppProps {
onCopyJson?: (data: { messages: any[] }) => void;
onMessagesChange?: (messages: z.infer<typeof CopilotMessage>[]) => void;
isInitialState?: boolean;
dataSources?: WithStringId<z.infer<typeof DataSource>>[];
dataSources?: z.infer<typeof DataSource>[];
}
const App = forwardRef<{ handleCopyChat: () => void; handleUserMessage: (message: string) => void }, AppProps>(function App({
@ -277,7 +276,7 @@ export const Copilot = forwardRef<{ handleUserMessage: (message: string) => void
chatContext?: z.infer<typeof CopilotChatContext>;
dispatch: (action: WorkflowDispatch) => void;
isInitialState?: boolean;
dataSources?: WithStringId<z.infer<typeof DataSource>>[];
dataSources?: z.infer<typeof DataSource>[];
}>(({
projectId,
workflow,

View file

@ -2,7 +2,7 @@ import { useCallback, useRef, useState } from "react";
import { getCopilotResponseStream } from "@/app/actions/copilot.actions";
import { CopilotMessage } from "@/app/lib/types/copilot_types";
import { Workflow } from "@/app/lib/types/workflow_types";
import { DataSource } from "@/app/lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { z } from "zod";
import { WithStringId } from "@/app/lib/types/types";
@ -10,7 +10,7 @@ interface UseCopilotParams {
projectId: string;
workflow: z.infer<typeof Workflow>;
context: any;
dataSources?: WithStringId<z.infer<typeof DataSource>>[];
dataSources?: z.infer<typeof DataSource>[];
}
interface UseCopilotResult {

View file

@ -1,7 +1,6 @@
"use client";
import { WithStringId } from "../../../lib/types/types";
import { WorkflowPrompt, WorkflowAgent, Workflow, WorkflowTool } from "../../../lib/types/workflow_types";
import { DataSource } from "../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { z } from "zod";
import { PlusIcon, Sparkles, X as XIcon, ChevronDown, ChevronRight, Trash2, Maximize2, Minimize2, StarIcon, DatabaseIcon, UserIcon, Settings, Info } from "lucide-react";
import { useState, useEffect, useRef } from "react";
@ -59,7 +58,7 @@ export function AgentConfig({
agents: z.infer<typeof WorkflowAgent>[],
tools: z.infer<typeof WorkflowTool>[],
prompts: z.infer<typeof WorkflowPrompt>[],
dataSources: WithStringId<z.infer<typeof DataSource>>[],
dataSources: z.infer<typeof DataSource>[],
handleUpdate: (agent: z.infer<typeof WorkflowAgent>) => void,
handleClose: () => void,
useRag: boolean,
@ -726,12 +725,12 @@ export function AgentConfig({
startContent={<PlusIcon className="w-4 h-4 text-gray-500" />}
>
{dataSources
.filter((ds) => !(agent.ragDataSources || []).includes(ds._id))
.filter((ds) => !(agent.ragDataSources || []).includes(ds.id))
.length > 0 ? (
dataSources
.filter((ds) => !(agent.ragDataSources || []).includes(ds._id))
.filter((ds) => !(agent.ragDataSources || []).includes(ds.id))
.map((ds) => (
<SelectItem key={ds._id}>
<SelectItem key={ds.id}>
{ds.name}
</SelectItem>
))
@ -775,7 +774,7 @@ export function AgentConfig({
{agent.ragDataSources !== undefined && agent.ragDataSources.length > 0 && (
<div className="flex flex-col gap-2 mt-2">
{(agent.ragDataSources || []).map((source) => {
const ds = dataSources.find((ds) => ds._id === source);
const ds = dataSources.find((ds) => ds.id === source);
return (
<div
key={source}

View file

@ -1,6 +1,5 @@
"use client";
import { WithStringId } from "../../../lib/types/types";
import { DataSource } from "../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { z } from "zod";
import { XIcon, FileIcon, GlobeIcon, AlertTriangle, CheckCircle, Circle, ExternalLinkIcon, Type, PlusIcon, Edit3Icon, DownloadIcon, Trash2 } from "lucide-react";
import { useState, useEffect, useCallback } from "react";
@ -8,9 +7,9 @@ import { Panel } from "@/components/common/panel-common";
import { Button } from "@/components/ui/button";
import { DataSourceIcon } from "@/app/lib/components/datasource-icon";
import { Tooltip } from "@heroui/react";
import { getDataSource, listDocsInDataSource, deleteDocsFromDataSource, getDownloadUrlForFile, addDocsToDataSource, getUploadUrlsForFilesDataSource } from "@/app/actions/data-source.actions";
import { getDataSource, listDocsInDataSource, deleteDocFromDataSource, getDownloadUrlForFile, addDocsToDataSource, getUploadUrlsForFilesDataSource } from "@/app/actions/data-source.actions";
import { InputField } from "@/app/lib/components/input-field";
import { DataSourceDoc } from "../../../lib/types/datasource_types";
import { DataSourceDoc } from "@/src/entities/models/data-source-doc";
import { RelativeTime } from "@primer/react";
import { Pagination, Spinner, Button as HeroButton, Textarea as HeroTextarea } from "@heroui/react";
import { useDropzone } from "react-dropzone";
@ -24,12 +23,12 @@ export function DataSourceConfig({
handleClose: () => void,
onDataSourceUpdate?: () => void
}) {
const [dataSource, setDataSource] = useState<WithStringId<z.infer<typeof DataSource>> | null>(null);
const [dataSource, setDataSource] = useState<z.infer<typeof DataSource> | null>(null);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
// Files-related state
const [files, setFiles] = useState<WithStringId<z.infer<typeof DataSourceDoc>>[]>([]);
const [files, setFiles] = useState<z.infer<typeof DataSourceDoc>[]>([]);
const [filesLoading, setFilesLoading] = useState(false);
const [filesPage, setFilesPage] = useState(1);
const [filesTotal, setFilesTotal] = useState(0);
@ -44,22 +43,22 @@ export function DataSourceConfig({
const currentProjectId = pathParts[2]; // /projects/[projectId]/workflow
setProjectId(currentProjectId);
const ds = await getDataSource(currentProjectId, dataSourceId);
const ds = await getDataSource(dataSourceId);
setDataSource(ds);
// Load files if it's a files data source
if (ds.data.type === 'files_local' || ds.data.type === 'files_s3') {
await loadFiles(currentProjectId, dataSourceId, 1);
await loadFiles(dataSourceId, 1);
}
// Load URLs if it's a URLs data source
if (ds.data.type === 'urls') {
await loadUrls(currentProjectId, dataSourceId, 1);
await loadUrls(dataSourceId, 1);
}
// Load text content if it's a text data source
if (ds.data.type === 'text') {
await loadTextContent(currentProjectId, dataSourceId);
await loadTextContent(dataSourceId);
}
} catch (err) {
console.error('Failed to load data source:', err);
@ -91,7 +90,7 @@ export function DataSourceConfig({
}
try {
const updatedSource = await getDataSource(projectId, dataSourceId);
const updatedSource = await getDataSource(dataSourceId);
if (!ignore) {
setDataSource(updatedSource);
onDataSourceUpdate?.(); // Notify parent of status change
@ -124,20 +123,19 @@ export function DataSourceConfig({
// Helper function to update data source and notify parent
const updateDataSourceAndNotify = useCallback(async () => {
try {
const updatedSource = await getDataSource(projectId, dataSourceId);
const updatedSource = await getDataSource(dataSourceId);
setDataSource(updatedSource);
onDataSourceUpdate?.();
} catch (err) {
console.error('Failed to reload data source:', err);
}
}, [projectId, dataSourceId, onDataSourceUpdate]);
}, [dataSourceId, onDataSourceUpdate]);
// Load files function
const loadFiles = async (projectId: string, sourceId: string, page: number) => {
const loadFiles = async (sourceId: string, page: number) => {
try {
setFilesLoading(true);
const { files, total } = await listDocsInDataSource({
projectId,
sourceId,
page,
limit: 10,
@ -153,7 +151,7 @@ export function DataSourceConfig({
};
// URLs-related state
const [urls, setUrls] = useState<WithStringId<z.infer<typeof DataSourceDoc>>[]>([]);
const [urls, setUrls] = useState<z.infer<typeof DataSourceDoc>[]>([]);
const [urlsLoading, setUrlsLoading] = useState(false);
const [urlsPage, setUrlsPage] = useState(1);
const [urlsTotal, setUrlsTotal] = useState(0);
@ -171,11 +169,10 @@ export function DataSourceConfig({
const [uploadingFiles, setUploadingFiles] = useState(false);
// Load URLs function
const loadUrls = async (projectId: string, sourceId: string, page: number) => {
const loadUrls = async (sourceId: string, page: number) => {
try {
setUrlsLoading(true);
const { files, total } = await listDocsInDataSource({
projectId,
sourceId,
page,
limit: 10,
@ -191,11 +188,10 @@ export function DataSourceConfig({
};
// Load text content function
const loadTextContent = async (projectId: string, sourceId: string) => {
const loadTextContent = async (sourceId: string) => {
try {
setTextLoading(true);
const { files } = await listDocsInDataSource({
projectId,
sourceId,
limit: 1,
});
@ -218,13 +214,11 @@ export function DataSourceConfig({
if (!window.confirm('Are you sure you want to delete this file?')) return;
try {
await deleteDocsFromDataSource({
projectId,
sourceId: dataSourceId,
docIds: [fileId],
await deleteDocFromDataSource({
docId: fileId,
});
// Reload files
await loadFiles(projectId, dataSourceId, filesPage);
await loadFiles(dataSourceId, filesPage);
// Reload data source to get updated status
await updateDataSourceAndNotify();
@ -236,7 +230,7 @@ export function DataSourceConfig({
// Handle file download
const handleDownloadFile = async (fileId: string) => {
try {
const url = await getDownloadUrlForFile(projectId, dataSourceId, fileId);
const url = await getDownloadUrlForFile(fileId);
window.open(url, '_blank');
} catch (err) {
console.error('Failed to download file:', err);
@ -245,7 +239,7 @@ export function DataSourceConfig({
// Handle page change
const handlePageChange = (page: number) => {
loadFiles(projectId, dataSourceId, page);
loadFiles(dataSourceId, page);
};
// Handle URL deletion
@ -253,13 +247,11 @@ export function DataSourceConfig({
if (!window.confirm('Are you sure you want to delete this URL?')) return;
try {
await deleteDocsFromDataSource({
projectId,
sourceId: dataSourceId,
docIds: [urlId],
await deleteDocFromDataSource({
docId: urlId,
});
// Reload URLs
await loadUrls(projectId, dataSourceId, urlsPage);
await loadUrls(dataSourceId, urlsPage);
// Reload data source to get updated status
await updateDataSourceAndNotify();
@ -270,7 +262,7 @@ export function DataSourceConfig({
// Handle URL page change
const handleUrlPageChange = (page: number) => {
loadUrls(projectId, dataSourceId, page);
loadUrls(dataSourceId, page);
};
// Handle text content update
@ -279,22 +271,18 @@ export function DataSourceConfig({
try {
// Delete existing text doc if it exists
const { files } = await listDocsInDataSource({
projectId,
sourceId: dataSourceId,
limit: 1,
});
if (files.length > 0) {
await deleteDocsFromDataSource({
projectId,
sourceId: dataSourceId,
docIds: [files[0]._id],
await deleteDocFromDataSource({
docId: files[0].id,
});
}
// Add new text doc
await addDocsToDataSource({
projectId,
sourceId: dataSourceId,
docData: [{
name: 'text',
@ -327,7 +315,6 @@ export function DataSourceConfig({
const first100Urls = urlsArray.slice(0, 100);
await addDocsToDataSource({
projectId,
sourceId: dataSourceId,
docData: first100Urls.map(url => ({
name: url,
@ -339,7 +326,7 @@ export function DataSourceConfig({
});
setShowAddUrlForm(false);
await loadUrls(projectId, dataSourceId, urlsPage);
await loadUrls(dataSourceId, urlsPage);
// Reload data source to get updated status
await updateDataSourceAndNotify();
@ -356,7 +343,7 @@ export function DataSourceConfig({
setUploadingFiles(true);
try {
const urls = await getUploadUrlsForFilesDataSource(projectId, dataSourceId, acceptedFiles.map(file => ({
const urls = await getUploadUrlsForFilesDataSource(dataSourceId, acceptedFiles.map(file => ({
name: file.name,
type: file.type,
size: file.size,
@ -403,17 +390,17 @@ export function DataSourceConfig({
name: file.name,
size: file.size,
mimeType: file.type,
path: urls[index].path,
},
}));
}
await addDocsToDataSource({
projectId,
sourceId: dataSourceId,
docData,
});
await loadFiles(projectId, dataSourceId, filesPage);
await loadFiles(dataSourceId, filesPage);
// Reload data source to get updated status
await updateDataSourceAndNotify();
@ -422,7 +409,7 @@ export function DataSourceConfig({
} finally {
setUploadingFiles(false);
}
}, [projectId, dataSourceId, dataSource, filesPage, updateDataSourceAndNotify]);
}, [dataSourceId, dataSource, filesPage, updateDataSourceAndNotify]);
const { getRootProps, getInputProps, isDragActive } = useDropzone({
onDrop: onFileDrop,
@ -676,7 +663,7 @@ export function DataSourceConfig({
<div className="space-y-2">
{files.map((file) => (
<div
key={file._id}
key={file.id}
className="flex items-center justify-between p-3 bg-gray-50 dark:bg-gray-800/50 rounded-lg border"
>
<div className="flex items-center gap-3 flex-1 min-w-0">
@ -696,7 +683,7 @@ export function DataSourceConfig({
{(file.data.type === 'file_local' || file.data.type === 'file_s3') && (
<Tooltip content="Download file">
<button
onClick={() => handleDownloadFile(file._id)}
onClick={() => handleDownloadFile(file.id)}
className="p-1 hover:bg-gray-200 dark:hover:bg-gray-700 rounded transition-colors"
>
<DownloadIcon className="w-4 h-4 text-gray-500" />
@ -705,7 +692,7 @@ export function DataSourceConfig({
)}
<Tooltip content="Delete file">
<button
onClick={() => handleDeleteFile(file._id)}
onClick={() => handleDeleteFile(file.id)}
className="p-1 hover:bg-red-100 dark:hover:bg-red-900/20 rounded transition-colors"
>
<Trash2 className="w-4 h-4 text-red-500" />
@ -805,7 +792,7 @@ export function DataSourceConfig({
<div className="space-y-2">
{urls.map((url) => (
<div
key={url._id}
key={url.id}
className="flex items-center justify-between p-3 bg-gray-50 dark:bg-gray-800/50 rounded-lg border"
>
<div className="flex items-center gap-3 flex-1 min-w-0">
@ -834,7 +821,7 @@ export function DataSourceConfig({
<div className="flex items-center gap-2">
<Tooltip content="Delete URL">
<button
onClick={() => handleDeleteUrl(url._id)}
onClick={() => handleDeleteUrl(url.id)}
className="p-1 hover:bg-red-100 dark:hover:bg-red-900/20 rounded transition-colors"
>
<Trash2 className="w-4 h-4 text-red-500" />

View file

@ -1,6 +1,5 @@
'use client';
import { WithStringId } from "../../../../lib/types/types";
import { DataSource } from "../../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { ToggleSource } from "../components/toggle-source";
import { Spinner } from "@heroui/react";
import { SourceStatus } from "../components/source-status";
@ -28,14 +27,14 @@ export function SourcePage({
sourceId: string;
projectId: string;
}) {
const [source, setSource] = useState<WithStringId<z.infer<typeof DataSource>> | null>(null);
const [source, setSource] = useState<z.infer<typeof DataSource> | null>(null);
const [isLoading, setIsLoading] = useState(true);
const [showSaveSuccess, setShowSaveSuccess] = useState(false);
const [billingError, setBillingError] = useState<string | null>(null);
async function handleReload() {
setIsLoading(true);
const updatedSource = await getDataSource(projectId, sourceId);
const updatedSource = await getDataSource(sourceId);
setSource(updatedSource);
if ("billingError" in updatedSource && updatedSource.billingError) {
setBillingError(updatedSource.billingError);
@ -48,7 +47,7 @@ export function SourcePage({
let ignore = false;
async function fetchSource() {
setIsLoading(true);
const source = await getDataSource(projectId, sourceId);
const source = await getDataSource(sourceId);
if (!ignore) {
setSource(source);
if ("billingError" in source && source.billingError) {
@ -61,7 +60,7 @@ export function SourcePage({
return () => {
ignore = true;
};
}, [projectId, sourceId]);
}, [sourceId]);
// refresh source data every 15 seconds
// under certain conditions
@ -80,7 +79,7 @@ export function SourcePage({
if (timeout) {
clearTimeout(timeout);
}
const updatedSource = await getDataSource(projectId, sourceId);
const updatedSource = await getDataSource(sourceId);
if (!ignore) {
setSource(updatedSource);
if ("billingError" in updatedSource && updatedSource.billingError) {
@ -130,7 +129,6 @@ export function SourcePage({
<SectionLabel>Toggle</SectionLabel>
<SectionContent>
<ToggleSource
projectId={projectId}
sourceId={sourceId}
active={source.active}
/>
@ -153,7 +151,6 @@ export function SourcePage({
action={async (formData: FormData) => {
const description = formData.get('description') as string;
await updateDataSource({
projectId,
sourceId,
description,
});
@ -217,7 +214,7 @@ export function SourcePage({
<SectionRow>
<SectionLabel>Status</SectionLabel>
<SectionContent>
<SourceStatus status={source.status} projectId={projectId} />
<SourceStatus status={source.status} />
{("billingError" in source) && source.billingError && <div className="flex flex-col gap-1 items-start mt-4">
<div className="text-sm">{source.billingError}</div>
@ -240,14 +237,12 @@ export function SourcePage({
{/* Source-specific sections */}
{source.data.type === 'urls' &&
<ScrapeSource
projectId={projectId}
dataSource={source}
handleReload={handleReload}
/>
}
{(source.data.type === 'files_local' || source.data.type === 'files_s3') &&
<FilesSource
projectId={projectId}
dataSource={source}
handleReload={handleReload}
type={source.data.type}
@ -255,7 +250,6 @@ export function SourcePage({
}
{source.data.type === 'text' &&
<TextSource
projectId={projectId}
dataSource={source}
handleReload={handleReload}
/>
@ -272,7 +266,7 @@ export function SourcePage({
This action cannot be undone.
</p>
</div>
<DeleteSource projectId={projectId} sourceId={sourceId} />
<DeleteSource sourceId={sourceId} />
</div>
</Section>
</div>

View file

@ -4,15 +4,13 @@ import { deleteDataSource } from "../../../../actions/data-source.actions";
import { FormStatusButton } from "../../../../lib/components/form-status-button";
export function DeleteSource({
projectId,
sourceId,
}: {
projectId: string;
sourceId: string;
}) {
function handleDelete() {
if (window.confirm('Are you sure you want to delete this data source?')) {
deleteDataSource(projectId, sourceId);
deleteDataSource(sourceId);
}
}

View file

@ -1,24 +1,20 @@
"use client";
import { WithStringId } from "../../../../lib/types/types";
import { DataSourceDoc, DataSource } from "../../../../lib/types/datasource_types";
import { DataSourceDoc } from "@/src/entities/models/data-source-doc";
import { DataSource } from "@/src/entities/models/data-source";
import { z } from "zod";
import { useCallback, useEffect, useState } from "react";
import { useDropzone } from "react-dropzone";
import { deleteDocsFromDataSource, getUploadUrlsForFilesDataSource, addDocsToDataSource, getDownloadUrlForFile, listDocsInDataSource } from "../../../../actions/data-source.actions";
import { deleteDocFromDataSource, getUploadUrlsForFilesDataSource, addDocsToDataSource, getDownloadUrlForFile, listDocsInDataSource } from "../../../../actions/data-source.actions";
import { RelativeTime } from "@primer/react";
import { Pagination, Spinner } from "@heroui/react";
import { DownloadIcon } from "lucide-react";
import { Section } from "./section";
function FileListItem({
projectId,
sourceId,
file,
onDelete,
}: {
projectId: string,
sourceId: string,
file: WithStringId<z.infer<typeof DataSourceDoc>>,
file: z.infer<typeof DataSourceDoc>,
onDelete: (fileId: string) => Promise<void>;
}) {
const [isDeleting, setIsDeleting] = useState(false);
@ -27,7 +23,7 @@ function FileListItem({
const handleDeleteClick = async () => {
setIsDeleting(true);
try {
await onDelete(file._id);
await onDelete(file.id);
} finally {
setIsDeleting(false);
}
@ -36,7 +32,7 @@ function FileListItem({
const handleDownloadClick = async () => {
setIsDownloading(true);
try {
const url = await getDownloadUrlForFile(projectId, sourceId, file._id);
const url = await getDownloadUrlForFile(file.id);
window.open(url, '_blank');
} catch (error) {
console.error('Download failed:', error);
@ -90,17 +86,15 @@ function FileListItem({
}
function PaginatedFileList({
projectId,
sourceId,
handleReload,
onDelete,
}: {
projectId: string,
sourceId: string,
handleReload: () => void;
onDelete: (fileId: string) => Promise<void>;
}) {
const [files, setFiles] = useState<WithStringId<z.infer<typeof DataSourceDoc>>[]>([]);
const [files, setFiles] = useState<z.infer<typeof DataSourceDoc>[]>([]);
const [page, setPage] = useState(1);
const [total, setTotal] = useState(0);
const [loading, setLoading] = useState(false);
@ -114,7 +108,6 @@ function PaginatedFileList({
setLoading(true);
try {
const { files, total } = await listDocsInDataSource({
projectId,
sourceId,
page,
limit: 10,
@ -134,7 +127,7 @@ function PaginatedFileList({
return () => {
ignore = true;
}
}, [projectId, sourceId, page]);
}, [sourceId, page]);
return (
<div className="space-y-4">
@ -154,10 +147,8 @@ function PaginatedFileList({
<div className="space-y-3">
{files.map(file => (
<FileListItem
key={file._id}
key={file.id}
file={file}
projectId={projectId}
sourceId={sourceId}
onDelete={onDelete}
/>
))}
@ -177,13 +168,11 @@ function PaginatedFileList({
}
export function FilesSource({
projectId,
dataSource,
handleReload,
type,
}: {
projectId: string,
dataSource: WithStringId<z.infer<typeof DataSource>>,
dataSource: z.infer<typeof DataSource>,
handleReload: () => void;
type: 'files_local' | 'files_s3';
}) {
@ -193,7 +182,7 @@ export function FilesSource({
const onDrop = useCallback(async (acceptedFiles: File[]) => {
setUploading(true);
try {
const urls = await getUploadUrlsForFilesDataSource(projectId, dataSource._id, acceptedFiles.map(file => ({
const urls = await getUploadUrlsForFilesDataSource(dataSource.id, acceptedFiles.map(file => ({
name: file.name,
type: file.type,
size: file.size,
@ -237,13 +226,13 @@ export function FilesSource({
name: file.name,
size: file.size,
mimeType: file.type,
path: urls[index].path,
},
}));
}
await addDocsToDataSource({
projectId,
sourceId: dataSource._id,
sourceId: dataSource.id,
docData,
});
@ -255,7 +244,7 @@ export function FilesSource({
} finally {
setUploading(false);
}
}, [projectId, dataSource._id, handleReload, type]);
}, [dataSource.id, handleReload, type]);
const { getRootProps, getInputProps, isDragActive } = useDropzone({
onDrop,
@ -299,14 +288,11 @@ export function FilesSource({
<PaginatedFileList
key={fileListKey}
projectId={projectId}
sourceId={dataSource._id}
sourceId={dataSource.id}
handleReload={handleReload}
onDelete={async (docId) => {
await deleteDocsFromDataSource({
projectId,
sourceId: dataSource._id,
docIds: [docId],
await deleteDocFromDataSource({
docId: docId,
});
handleReload();
setFileListKey(prev => prev + 1);

View file

@ -1,9 +1,9 @@
"use client";
import { WithStringId } from "../../../../lib/types/types";
import { DataSourceDoc, DataSource } from "../../../../lib/types/datasource_types";
import { DataSourceDoc } from "@/src/entities/models/data-source-doc";
import { DataSource } from "@/src/entities/models/data-source";
import { z } from "zod";
import { Recrawl } from "./web-recrawl";
import { deleteDocsFromDataSource, listDocsInDataSource, recrawlWebDataSource, addDocsToDataSource } from "../../../../actions/data-source.actions";
import { deleteDocFromDataSource, listDocsInDataSource, recrawlWebDataSource, addDocsToDataSource } from "../../../../actions/data-source.actions";
import { useState, useEffect } from "react";
import { Spinner, Pagination } from "@heroui/react";
import { ExternalLinkIcon, PlusIcon } from "lucide-react";
@ -13,7 +13,7 @@ import { Textarea } from "@/components/ui/textarea";
import { Section } from "./section";
function UrlListItem({ file, onDelete }: {
file: WithStringId<z.infer<typeof DataSourceDoc>>,
file: z.infer<typeof DataSourceDoc>,
onDelete: (fileId: string) => Promise<void>;
}) {
const [isDeleting, setIsDeleting] = useState(false);
@ -37,7 +37,7 @@ function UrlListItem({ file, onDelete }: {
onClick={async () => {
setIsDeleting(true);
try {
await onDelete(file._id);
await onDelete(file.id);
} finally {
setIsDeleting(false);
}
@ -51,12 +51,11 @@ function UrlListItem({ file, onDelete }: {
);
}
function UrlList({ projectId, sourceId, onDelete }: {
projectId: string,
function UrlList({ sourceId, onDelete }: {
sourceId: string,
onDelete: (fileId: string) => Promise<void>,
}) {
const [files, setFiles] = useState<WithStringId<z.infer<typeof DataSourceDoc>>[]>([]);
const [files, setFiles] = useState<z.infer<typeof DataSourceDoc>[]>([]);
const [loading, setLoading] = useState(true);
const [page, setPage] = useState(1);
const [total, setTotal] = useState(0);
@ -69,7 +68,7 @@ function UrlList({ projectId, sourceId, onDelete }: {
async function fetchFiles() {
setLoading(true);
try {
const { files, total } = await listDocsInDataSource({ projectId, sourceId, page, limit: 10 });
const { files, total } = await listDocsInDataSource({ sourceId, page, limit: 10 });
if (!ignore) {
setFiles(files);
setTotal(total);
@ -86,7 +85,7 @@ function UrlList({ projectId, sourceId, onDelete }: {
return () => {
ignore = true;
};
}, [projectId, sourceId, page]);
}, [sourceId, page]);
return (
<div className="mt-6 space-y-4">
@ -102,7 +101,7 @@ function UrlList({ projectId, sourceId, onDelete }: {
) : (
<div className="space-y-2">
{files.map(file => (
<UrlListItem key={file._id} file={file} onDelete={onDelete} />
<UrlListItem key={file.id} file={file} onDelete={onDelete} />
))}
{Math.ceil(total / 10) > 1 && (
<div className="mt-4">
@ -120,12 +119,10 @@ function UrlList({ projectId, sourceId, onDelete }: {
}
export function ScrapeSource({
projectId,
dataSource,
handleReload,
}: {
projectId: string,
dataSource: WithStringId<z.infer<typeof DataSource>>,
dataSource: z.infer<typeof DataSource>,
handleReload: () => void;
}) {
const [fileListKey, setFileListKey] = useState(0);
@ -161,8 +158,7 @@ export function ScrapeSource({
const first100Urls = urlsArray.slice(0, 100);
await addDocsToDataSource({
projectId,
sourceId: dataSource._id,
sourceId: dataSource.id,
docData: first100Urls.map(url => ({
name: url,
data: {
@ -209,13 +205,10 @@ export function ScrapeSource({
<UrlList
key={fileListKey}
projectId={projectId}
sourceId={dataSource._id}
sourceId={dataSource.id}
onDelete={async (docId) => {
await deleteDocsFromDataSource({
projectId,
sourceId: dataSource._id,
docIds: [docId],
await deleteDocFromDataSource({
docId: docId,
});
handleReload();
setFileListKey(prev => prev + 1);
@ -230,10 +223,8 @@ export function ScrapeSource({
description="Update the content by scraping the URLs again."
>
<Recrawl
projectId={projectId}
sourceId={dataSource._id}
handleRefresh={async () => {
await recrawlWebDataSource(projectId, dataSource._id);
await recrawlWebDataSource(dataSource.id);
handleReload();
setFileListKey(prev => prev + 1);
}}

View file

@ -1,17 +1,15 @@
'use client';
import { getDataSource } from "../../../../actions/data-source.actions";
import { DataSource } from "../../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { useEffect, useState } from "react";
import { z } from 'zod';
import { SourceStatus } from "./source-status";
export function SelfUpdatingSourceStatus({
projectId,
sourceId,
initialStatus,
compact = false,
}: {
projectId: string;
sourceId: string,
initialStatus: z.infer<typeof DataSource>['status'],
compact?: boolean;
@ -26,7 +24,7 @@ export function SelfUpdatingSourceStatus({
if (ignore) {
return;
}
const source = await getDataSource(projectId, sourceId);
const source = await getDataSource(sourceId);
setStatus(source.status);
timeoutId = setTimeout(check, 15 * 1000);
}
@ -41,7 +39,7 @@ export function SelfUpdatingSourceStatus({
clearTimeout(timeoutId);
}
};
}, [status, projectId, sourceId]);
}, [status, sourceId]);
return <SourceStatus status={status} compact={compact} projectId={projectId} />;
return <SourceStatus status={status} compact={compact} />;
}

View file

@ -1,15 +1,13 @@
import { DataSource } from "../../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { Spinner } from "@heroui/react";
import { z } from 'zod';
import { CheckCircleIcon, XCircleIcon, ClockIcon } from "lucide-react";
export function SourceStatus({
status,
projectId,
compact = false,
}: {
status: z.infer<typeof DataSource>['status'],
projectId: string,
compact?: boolean;
}) {
return (

View file

@ -6,15 +6,14 @@ import { ToggleSource } from "./toggle-source";
import { SelfUpdatingSourceStatus } from "./self-updating-source-status";
import { DataSourceIcon } from "../../../../lib/components/datasource-icon";
import { useEffect, useState } from "react";
import { WithStringId } from "../../../../lib/types/types";
import { DataSource } from "../../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { z } from "zod";
import { listDataSources } from "../../../../actions/data-source.actions";
import { Panel } from "@/components/common/panel-common";
import { PlusIcon } from "lucide-react";
export function SourcesList({ projectId }: { projectId: string }) {
const [sources, setSources] = useState<WithStringId<z.infer<typeof DataSource>>[]>([]);
const [sources, setSources] = useState<z.infer<typeof DataSource>[]>([]);
const [loading, setLoading] = useState(true);
useEffect(() => {
@ -115,12 +114,12 @@ export function SourcesList({ projectId }: { projectId: string }) {
<tbody className="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-700">
{sources.map((source) => (
<tr
key={source._id}
key={source.id}
className="hover:bg-gray-50 dark:hover:bg-gray-750 transition-colors"
>
<td className="px-6 py-4 text-left">
<Link
href={`/projects/${projectId}/sources/${source._id}`}
href={`/projects/${projectId}/sources/${source.id}`}
size="lg"
isBlock
className="text-sm text-gray-900 dark:text-gray-100 hover:text-blue-600 dark:hover:text-blue-400 truncate block"
@ -158,8 +157,7 @@ export function SourcesList({ projectId }: { projectId: string }) {
<td className="px-6 py-4 text-left">
<div className="text-sm">
<SelfUpdatingSourceStatus
sourceId={source._id}
projectId={projectId}
sourceId={source.id}
initialStatus={source.status}
compact={true}
/>
@ -168,8 +166,7 @@ export function SourcesList({ projectId }: { projectId: string }) {
)}
<td className="px-6 py-4 text-left">
<ToggleSource
projectId={projectId}
sourceId={source._id}
sourceId={source.id}
active={source.active}
compact={true}
className="bg-default-100"

View file

@ -1,21 +1,18 @@
"use client";
import { WithStringId } from "../../../../lib/types/types";
import { DataSource } from "../../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { z } from "zod";
import { useState, useEffect } from "react";
import { Textarea } from "@/components/ui/textarea";
import { FormStatusButton } from "../../../../lib/components/form-status-button";
import { Spinner } from "@heroui/react";
import { addDocsToDataSource, deleteDocsFromDataSource, listDocsInDataSource } from "../../../../actions/data-source.actions";
import { addDocsToDataSource, deleteDocFromDataSource, listDocsInDataSource } from "../../../../actions/data-source.actions";
import { Section } from "./section";
export function TextSource({
projectId,
dataSource,
handleReload,
}: {
projectId: string,
dataSource: WithStringId<z.infer<typeof DataSource>>,
dataSource: z.infer<typeof DataSource>,
handleReload: () => void;
}) {
const [content, setContent] = useState("");
@ -30,8 +27,7 @@ export function TextSource({
setIsLoading(true);
try {
const { files } = await listDocsInDataSource({
projectId,
sourceId: dataSource._id,
sourceId: dataSource.id,
limit: 1,
});
@ -41,7 +37,7 @@ export function TextSource({
const doc = files[0];
if (doc.data.type === 'text') {
setContent(doc.data.content);
setDocId(doc._id);
setDocId(doc.id);
}
}
} catch (error) {
@ -55,7 +51,7 @@ export function TextSource({
return () => {
ignore = true;
};
}, [projectId, dataSource._id]);
}, [dataSource.id]);
async function handleSubmit(formData: FormData) {
setIsSaving(true);
@ -64,17 +60,14 @@ export function TextSource({
// Delete existing doc if it exists
if (docId) {
await deleteDocsFromDataSource({
projectId,
sourceId: dataSource._id,
docIds: [docId],
await deleteDocFromDataSource({
docId: docId,
});
}
// Add new doc
await addDocsToDataSource({
projectId,
sourceId: dataSource._id,
sourceId: dataSource.id,
docData: [{
name: 'text',
data: {

View file

@ -4,13 +4,11 @@ import { Spinner } from "@heroui/react";
import { useState } from "react";
export function ToggleSource({
projectId,
sourceId,
active,
compact = false,
className
}: {
projectId: string;
sourceId: string;
active: boolean;
compact?: boolean;
@ -22,7 +20,7 @@ export function ToggleSource({
async function handleToggle() {
setLoading(true);
try {
await toggleDataSource(projectId, sourceId, !isActive);
await toggleDataSource(sourceId, !isActive);
setIsActive(!isActive);
} finally {
setLoading(false);

View file

@ -3,12 +3,8 @@ import { FormStatusButton } from "../../../../lib/components/form-status-button"
import { RefreshCwIcon } from "lucide-react";
export function Recrawl({
projectId,
sourceId,
handleRefresh,
}: {
projectId: string;
sourceId: string;
handleRefresh: () => void;
}) {
return <form action={handleRefresh}>

View file

@ -71,8 +71,7 @@ export function Form({
// pick first 100
const first100Urls = urlsArray.slice(0, 100);
await addDocsToDataSource({
projectId,
sourceId: source._id,
sourceId: source.id,
docData: first100Urls.map(url => ({
name: url,
data: {
@ -82,7 +81,7 @@ export function Form({
})),
});
if (onSuccess) {
onSuccess(source._id);
onSuccess(source.id);
}
}
@ -97,7 +96,7 @@ export function Form({
});
if (onSuccess) {
onSuccess(source._id);
onSuccess(source.id);
}
}
@ -114,8 +113,7 @@ export function Form({
const content = formData.get('content') as string;
await addDocsToDataSource({
projectId,
sourceId: source._id,
sourceId: source.id,
docData: [{
name: 'text',
data: {
@ -126,7 +124,7 @@ export function Form({
});
if (onSuccess) {
onSuccess(source._id);
onSuccess(source.id);
}
}

View file

@ -1,6 +1,6 @@
"use client";
import { MCPServer, WithStringId } from "../../../lib/types/types";
import { DataSource } from "../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { Project } from "../../../lib/types/project_types";
import { z } from "zod";
import { useCallback, useEffect, useState } from "react";
@ -32,7 +32,7 @@ export function App({
}) {
const [mode, setMode] = useState<'draft' | 'live'>('draft');
const [project, setProject] = useState<WithStringId<z.infer<typeof Project>> | null>(null);
const [dataSources, setDataSources] = useState<WithStringId<z.infer<typeof DataSource>>[] | null>(null);
const [dataSources, setDataSources] = useState<z.infer<typeof DataSource>[] | null>(null);
const [projectConfig, setProjectConfig] = useState<z.infer<typeof Project> | null>(null);
const [loading, setLoading] = useState(false);
const [eligibleModels, setEligibleModels] = useState<z.infer<typeof ModelsResponse> | "*">("*");

View file

@ -6,8 +6,7 @@ import { Button } from '@/components/ui/button';
import { Form } from '../../sources/new/form';
import { FilesSource } from '../../sources/components/files-source';
import { getDataSource } from '../../../../actions/data-source.actions';
import { WithStringId } from '../../../../lib/types/types';
import { DataSource } from '../../../../lib/types/datasource_types';
import { DataSource } from "@/src/entities/models/data-source";
import { z } from 'zod';
interface DataSourcesModalProps {
@ -30,11 +29,11 @@ export function DataSourcesModal({
useRagScraping
}: DataSourcesModalProps) {
const [currentView, setCurrentView] = useState<'form' | 'upload'>('form');
const [createdSource, setCreatedSource] = useState<WithStringId<z.infer<typeof DataSource>> | null>(null);
const [createdSource, setCreatedSource] = useState<z.infer<typeof DataSource> | null>(null);
const handleDataSourceCreated = async (sourceId: string) => {
// Get the created data source
const source = await getDataSource(projectId, sourceId);
const source = await getDataSource(sourceId);
// If it's a files data source, show the upload interface
if (source.data.type === 'files_local' || source.data.type === 'files_s3') {
@ -93,7 +92,6 @@ export function DataSourcesModal({
) : (
createdSource && (
<FilesSource
projectId={projectId}
dataSource={createdSource}
handleReload={handleFilesUploaded}
type={createdSource.data.type as 'files_local' | 'files_s3'}

View file

@ -2,7 +2,7 @@ import React, { forwardRef, useImperativeHandle } from "react";
import { z } from "zod";
import { WorkflowPrompt, WorkflowAgent, WorkflowTool, WorkflowPipeline, Workflow } from "../../../lib/types/workflow_types";
import { Project } from "../../../lib/types/project_types";
import { DataSource } from "../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { WithStringId } from "../../../lib/types/types";
import { Dropdown, DropdownItem, DropdownTrigger, DropdownMenu } from "@heroui/react";
import { useRef, useEffect, useState } from "react";
@ -48,7 +48,7 @@ interface EntityListProps {
tools: z.infer<typeof WorkflowTool>[];
prompts: z.infer<typeof WorkflowPrompt>[];
pipelines: z.infer<typeof WorkflowPipeline>[];
dataSources: WithStringId<z.infer<typeof DataSource>>[];
dataSources: z.infer<typeof DataSource>[];
workflow: z.infer<typeof Workflow>;
selectedEntity: {
type: "agent" | "tool" | "prompt" | "datasource" | "pipeline" | "visualise";
@ -1071,14 +1071,14 @@ export const EntityList = forwardRef<
className={clsx(
"flex items-center gap-2 px-3 py-2 rounded-md min-h-[24px] cursor-pointer",
{
"bg-indigo-50 dark:bg-indigo-950/30": selectedEntity?.type === "datasource" && selectedEntity.name === dataSource._id,
"hover:bg-zinc-50 dark:hover:bg-zinc-800": !(selectedEntity?.type === "datasource" && selectedEntity.name === dataSource._id)
"bg-indigo-50 dark:bg-indigo-950/30": selectedEntity?.type === "datasource" && selectedEntity.name === dataSource.id,
"hover:bg-zinc-50 dark:hover:bg-zinc-800": !(selectedEntity?.type === "datasource" && selectedEntity.name === dataSource.id)
}
)}
onClick={() => handleSelectDataSource(dataSource._id)}
onClick={() => handleSelectDataSource(dataSource.id)}
>
<div
ref={selectedEntity?.type === "datasource" && selectedEntity.name === dataSource._id ? selectedRef : undefined}
ref={selectedEntity?.type === "datasource" && selectedEntity.name === dataSource.id ? selectedRef : undefined}
className="flex-1 flex items-center gap-2 text-sm text-left"
>
<div className="shrink-0 flex items-center justify-center w-3 h-3">
@ -1097,7 +1097,7 @@ export const EntityList = forwardRef<
name={dataSource.name}
onDelete={async () => {
if (window.confirm(`Are you sure you want to delete the data source "${dataSource.name}"?`)) {
await deleteDataSource(projectId, dataSource._id);
await deleteDataSource(dataSource.id);
onDataSourcesUpdated?.();
}
}}

View file

@ -2,7 +2,7 @@
import React, { useReducer, Reducer, useState, useCallback, useEffect, useRef, createContext, useContext } from "react";
import { MCPServer, Message, WithStringId } from "../../../lib/types/types";
import { Workflow, WorkflowTool, WorkflowPrompt, WorkflowAgent, WorkflowPipeline } from "../../../lib/types/workflow_types";
import { DataSource } from "../../../lib/types/datasource_types";
import { DataSource } from "@/src/entities/models/data-source";
import { Project } from "../../../lib/types/project_types";
import { produce, applyPatches, enablePatches, produceWithPatches, Patch } from 'immer';
import { AgentConfig } from "../entities/agent_config";
@ -821,7 +821,7 @@ export function WorkflowEditor({
chatWidgetHost,
}: {
projectId: string;
dataSources: WithStringId<z.infer<typeof DataSource>>[];
dataSources: z.infer<typeof DataSource>[];
workflow: z.infer<typeof Workflow>;
useRag: boolean;
useRagUploads: boolean;

View file

@ -1,66 +1,50 @@
import '../lib/loadenv';
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import FirecrawlApp from '@mendable/firecrawl-js';
import { z } from 'zod';
import { dataSourceDocsCollection, dataSourcesCollection, projectsCollection, usersCollection } from '../lib/mongodb';
import { EmbeddingRecord, DataSourceDoc, DataSource } from "../lib/types/datasource_types";
import { ObjectId, WithId } from 'mongodb';
import { EmbeddingRecord } from "../lib/types/datasource_types";
import { DataSourceDoc } from "@/src/entities/models/data-source-doc";
import { embedMany, generateText } from 'ai';
import { embeddingModel } from '../lib/embedding';
import { qdrantClient } from '../lib/qdrant';
import { PrefixLogger } from "../lib/utils";
import { GoogleGenerativeAI } from "@google/generative-ai";
import { GetObjectCommand } from "@aws-sdk/client-s3";
import { uploadsS3Client } from '../lib/uploads_s3_client';
import fs from 'fs/promises';
import crypto from 'crypto';
import path from 'path';
import { createOpenAI } from '@ai-sdk/openai';
import { USE_BILLING, USE_GEMINI_FILE_PARSING } from '../lib/feature_flags';
import { authorize, getCustomerIdForProject, logUsage, UsageTracker } from '../lib/billing';
import { BillingError } from '@/src/entities/errors/common';
import { DataSource } from '@/src/entities/models/data-source';
import { IDataSourcesRepository } from '@/src/application/repositories/data-sources.repository.interface';
import { IDataSourceDocsRepository } from '@/src/application/repositories/data-source-docs.repository.interface';
import { IUploadsStorageService } from '@/src/application/services/uploads-storage.service.interface';
import { container } from '@/di/container';
const FILE_PARSING_PROVIDER_API_KEY = process.env.FILE_PARSING_PROVIDER_API_KEY || process.env.OPENAI_API_KEY || '';
const FILE_PARSING_PROVIDER_BASE_URL = process.env.FILE_PARSING_PROVIDER_BASE_URL || undefined;
const FILE_PARSING_MODEL = process.env.FILE_PARSING_MODEL || 'gpt-4o';
const dataSourcesRepository = container.resolve<IDataSourcesRepository>('dataSourcesRepository');
const dataSourceDocsRepository = container.resolve<IDataSourceDocsRepository>('dataSourceDocsRepository');
const localUploadsStorageService = container.resolve<IUploadsStorageService>('localUploadsStorageService');
const s3UploadsStorageService = container.resolve<IUploadsStorageService>('s3UploadsStorageService');
const firecrawl = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY || "test" });
const openai = createOpenAI({
apiKey: FILE_PARSING_PROVIDER_API_KEY,
baseURL: FILE_PARSING_PROVIDER_BASE_URL,
});
const UPLOADS_DIR = process.env.RAG_UPLOADS_DIR || '/uploads';
const splitter = new RecursiveCharacterTextSplitter({
separators: ['\n\n', '\n', '. ', '.', ''],
chunkSize: 1024,
chunkOverlap: 20,
});
const second = 1000;
const minute = 60 * second;
const hour = 60 * minute;
const day = 24 * hour;
// Configure Google Gemini API
const genAI = new GoogleGenerativeAI(process.env.GOOGLE_API_KEY || '');
async function getLocalFileContent(path: string): Promise<Buffer> {
return await fs.readFile(path);
}
async function getS3FileContent(s3Key: string): Promise<Buffer> {
const command = new GetObjectCommand({
Bucket: process.env.RAG_UPLOADS_S3_BUCKET,
Key: s3Key,
});
const response = await uploadsS3Client.send(command);
const chunks: Uint8Array[] = [];
for await (const chunk of response.Body as any) {
chunks.push(chunk);
}
return Buffer.concat(chunks);
}
async function retryable<T>(fn: () => Promise<T>, maxAttempts: number = 3): Promise<T> {
let attempts = 0;
while (true) {
@ -75,19 +59,23 @@ async function retryable<T>(fn: () => Promise<T>, maxAttempts: number = 3): Prom
}
}
async function runProcessPipeline(_logger: PrefixLogger, usageTracker: UsageTracker, job: WithId<z.infer<typeof DataSource>>, doc: WithId<z.infer<typeof DataSourceDoc>> & { data: { type: "file_local" | "file_s3" } }) {
async function runProcessFilePipeline(_logger: PrefixLogger, usageTracker: UsageTracker, job: z.infer<typeof DataSource>, doc: z.infer<typeof DataSourceDoc>) {
if (doc.data.type !== 'file_local' && doc.data.type !== 'file_s3') {
throw new Error("Invalid data source type");
}
const logger = _logger
.child(doc._id.toString())
.child(doc.id)
.child(doc.name);
// Get file content
let fileData: Buffer;
if (doc.data.type === 'file_local') {
logger.log("Fetching file from local");
fileData = await getLocalFileContent(path.join(UPLOADS_DIR, doc._id.toString()));
fileData = await localUploadsStorageService.getFileContents(doc.id);
} else {
logger.log("Fetching file from S3");
fileData = await getS3FileContent(doc.data.s3Key);
fileData = await s3UploadsStorageService.getFileContents(doc.id);
}
let markdown = "";
@ -167,8 +155,8 @@ async function runProcessPipeline(_logger: PrefixLogger, usageTracker: UsageTrac
vector: embedding,
payload: {
projectId: job.projectId,
sourceId: job._id.toString(),
docId: doc._id.toString(),
sourceId: job.id,
docId: doc.id,
content: splits[i].pageContent,
title: doc.name,
name: doc.name,
@ -180,21 +168,136 @@ async function runProcessPipeline(_logger: PrefixLogger, usageTracker: UsageTrac
// store content in doc record
logger.log("Storing content in doc record");
await dataSourceDocsCollection.updateOne({
_id: doc._id,
version: doc.version,
}, {
$set: {
content: markdown,
status: "ready",
lastUpdatedAt: new Date().toISOString(),
}
await dataSourceDocsRepository.updateByVersion(doc.id, doc.version, {
content: markdown,
status: "ready",
});
}
async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<typeof DataSource>>, doc: WithId<z.infer<typeof DataSourceDoc>>): Promise<void> {
async function runScrapePipeline(_logger: PrefixLogger, usageTracker: UsageTracker, job: z.infer<typeof DataSource>, doc: z.infer<typeof DataSourceDoc>) {
const logger = _logger
.child(doc._id.toString())
.child(doc.id)
.child(doc.name);
// scrape the url using firecrawl
logger.log("Scraping using Firecrawl");
const scrapeResult = await retryable(async () => {
if (doc.data.type !== 'url') {
throw new Error("Invalid data source type");
}
const scrapeResult = await firecrawl.scrapeUrl(doc.data.url, {
formats: ['markdown'],
onlyMainContent: true,
excludeTags: ['script', 'style', 'noscript', 'img',]
});
if (!scrapeResult.success) {
throw new Error("Unable to scrape URL: " + doc.data.url);
}
return scrapeResult;
}, 3); // Retry up to 3 times
usageTracker.track({
type: "FIRECRAWL_SCRAPE_USAGE",
context: "rag.urls.firecrawl_scrape",
});
// split into chunks
logger.log("Splitting into chunks");
const splits = await splitter.createDocuments([scrapeResult.markdown || '']);
// generate embeddings
logger.log("Generating embeddings");
const { embeddings, usage } = await embedMany({
model: embeddingModel,
values: splits.map((split) => split.pageContent)
});
usageTracker.track({
type: "EMBEDDING_MODEL_USAGE",
modelName: embeddingModel.modelId,
tokens: usage.tokens,
context: "rag.urls.embedding_usage",
});
// store embeddings in qdrant
logger.log("Storing embeddings in Qdrant");
const points: z.infer<typeof EmbeddingRecord>[] = embeddings.map((embedding, i) => ({
id: crypto.randomUUID(),
vector: embedding,
payload: {
projectId: job.projectId,
sourceId: job.id,
docId: doc.id,
content: splits[i].pageContent,
title: scrapeResult.metadata?.title || '',
name: doc.name,
},
}));
await qdrantClient.upsert("embeddings", {
points,
});
// store scraped markdown in doc record
logger.log("Storing scraped markdown in doc record");
await dataSourceDocsRepository.updateByVersion(doc.id, doc.version, {
content: scrapeResult.markdown,
status: "ready",
});
}
/**
 * Indexes a single plain-text document.
 *
 * Steps, in order: split `doc.data.content` into chunks, embed every chunk,
 * report the embedding token usage to `usageTracker`, upsert one point per
 * embedding into the "embeddings" collection, and finally write the content
 * and a "ready" status back through `dataSourceDocsRepository.updateByVersion`.
 *
 * @param _logger parent logger; a child logger scoped by doc id and doc name is derived from it
 * @param usageTracker receives an EMBEDDING_MODEL_USAGE entry for the embedding call
 * @param job data source that owns the document (supplies `projectId` and `id` for the payload)
 * @param doc document to index; must carry `data.type === 'text'`
 * @throws Error("Invalid data source type") when the document is not a text document
 */
async function runProcessTextPipeline(_logger: PrefixLogger, usageTracker: UsageTracker, job: z.infer<typeof DataSource>, doc: z.infer<typeof DataSourceDoc>) {
    const logger = _logger.child(doc.id).child(doc.name);

    // Only text documents are handled here; anything else is a caller error.
    if (doc.data.type !== 'text') {
        throw new Error("Invalid data source type");
    }
    const text = doc.data.content;

    // split into chunks
    logger.log("Splitting into chunks");
    const chunks = await splitter.createDocuments([text]);

    // generate embeddings
    logger.log("Generating embeddings");
    const embedResult = await embedMany({
        model: embeddingModel,
        values: chunks.map((chunk) => chunk.pageContent),
    });
    usageTracker.track({
        type: "EMBEDDING_MODEL_USAGE",
        modelName: embeddingModel.modelId,
        tokens: embedResult.usage.tokens,
        context: "rag.text.embedding_usage",
    });

    // store embeddings in qdrant: one point per embedding, paired with its chunk by index
    logger.log("Storing embeddings in Qdrant");
    const points: z.infer<typeof EmbeddingRecord>[] = [];
    embedResult.embeddings.forEach((vector, index) => {
        points.push({
            id: crypto.randomUUID(),
            vector,
            payload: {
                projectId: job.projectId,
                sourceId: job.id,
                docId: doc.id,
                content: chunks[index].pageContent,
                title: doc.name,
                name: doc.name,
            },
        });
    });
    await qdrantClient.upsert("embeddings", { points });

    // store content in doc record
    logger.log("Storing content in doc record");
    await dataSourceDocsRepository.updateByVersion(doc.id, doc.version, {
        content: text,
        status: "ready",
    });
}
async function runDeletionPipeline(_logger: PrefixLogger, job: z.infer<typeof DataSource>, doc: z.infer<typeof DataSourceDoc>): Promise<void> {
const logger = _logger
.child(doc.id)
.child(doc.name);
// Delete embeddings from qdrant
@ -211,13 +314,13 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
{
key: "sourceId",
match: {
value: job._id.toString(),
value: job.id,
}
},
{
key: "docId",
match: {
value: doc._id.toString(),
value: doc.id,
}
}
],
@ -226,85 +329,33 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
// Delete docs from db
logger.log("Deleting doc from db");
await dataSourceDocsCollection.deleteOne({ _id: doc._id });
await dataSourceDocsRepository.delete(doc.id);
}
// fetch next job from mongodb
(async () => {
while (true) {
const now = Date.now();
let job: WithId<z.infer<typeof DataSource>> | null = null;
let job: z.infer<typeof DataSource> | null = null;
// first try to find a job that needs deleting
job = await dataSourcesCollection.findOneAndUpdate({
status: "deleted",
"data.type": { $in: ["files_local", "files_s3"] },
$or: [
{ attempts: { $exists: false } },
{ attempts: { $lte: 3 } }
]
}, { $set: { lastAttemptAt: new Date().toISOString() }, $inc: { attempts: 1 } }, { returnDocument: "after", sort: { createdAt: 1 } });
job = await dataSourcesRepository.pollDeleteJob();
if (job === null) {
job = await dataSourcesCollection.findOneAndUpdate(
{
$and: [
{ 'data.type': { $in: ["files_local", "files_s3"] } },
{
$or: [
// if the job has never been attempted
{
status: "pending",
attempts: 0,
},
// if the job was attempted but wasn't completed in the last hour
{
status: "pending",
lastAttemptAt: { $lt: new Date(now - 1 * hour).toISOString() },
},
// if the job errored out but hasn't been retried 3 times yet
{
status: "error",
attempts: { $lt: 3 },
},
// if the job errored out but hasn't been retried in the last 5 minutes
{
status: "error",
lastAttemptAt: { $lt: new Date(now - 1 * hour).toISOString() },
},
]
}
]
},
{
$set: {
status: "pending",
lastAttemptAt: new Date().toISOString(),
},
$inc: {
attempts: 1
},
},
{ returnDocument: "after", sort: { createdAt: 1 } }
);
job = await dataSourcesRepository.pollPendingJob();
}
if (job === null) {
// if no doc found, sleep for a bit and start again
await new Promise(resolve => setTimeout(resolve, 5 * second));
await new Promise(resolve => setTimeout(resolve, 5 * 1000));
continue;
}
const logger = new PrefixLogger(`${job._id.toString()}-${job.version}`);
logger.log(`Starting job ${job._id}. Type: ${job.data.type}. Status: ${job.status}`);
const logger = new PrefixLogger(`${job.id}-${job.version}`);
logger.log(`Starting job ${job.id}. Type: ${job.data.type}. Status: ${job.status}`);
let errors = false;
try {
if (job.data.type !== 'files_local' && job.data.type !== 'files_s3') {
throw new Error("Invalid data source type");
}
if (job.status === "deleted") {
// delete all embeddings for this source
logger.log("Deleting embeddings from Qdrant");
@ -312,32 +363,33 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
filter: {
must: [
{ key: "projectId", match: { value: job.projectId } },
{ key: "sourceId", match: { value: job._id.toString() } },
{ key: "sourceId", match: { value: job.id } },
],
},
});
// delete all docs for this source
logger.log("Deleting docs from db");
await dataSourceDocsCollection.deleteMany({
sourceId: job._id.toString(),
});
await dataSourceDocsRepository.deleteBySourceId(job.id);
// delete the source record from db
logger.log("Deleting source record from db");
await dataSourcesCollection.deleteOne({
_id: job._id,
});
await dataSourcesRepository.delete(job.id);
logger.log("Job deleted");
continue;
}
// fetch docs that need updating
const pendingDocs = await dataSourceDocsCollection.find({
sourceId: job._id.toString(),
status: { $in: ["pending", "error"] },
}).toArray();
const pendingDocs = [];
let cursor = undefined;
do {
const result = await dataSourceDocsRepository.list(job.id, {
status: ["pending", "error"],
}, cursor);
pendingDocs.push(...result.items);
cursor = result.nextCursor;
} while (cursor);
logger.log(`Found ${pendingDocs.length} docs to process`);
@ -365,21 +417,21 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
}
}
const ldoc = doc as WithId<z.infer<typeof DataSourceDoc>> & { data: { type: "file_local" | "file_s3" } };
const usageTracker = new UsageTracker();
try {
await runProcessPipeline(logger, usageTracker, job, ldoc);
if (doc.data.type === "file_local" || doc.data.type === "file_s3") {
await runProcessFilePipeline(logger, usageTracker, job, doc);
} else if (doc.data.type === "text") {
await runProcessTextPipeline(logger, usageTracker, job, doc);
} else if (doc.data.type === "url") {
await runScrapePipeline(logger, usageTracker, job, doc);
}
} catch (e: any) {
errors = true;
logger.log("Error processing doc:", e);
await dataSourceDocsCollection.updateOne({
_id: doc._id,
version: doc.version,
}, {
$set: {
status: "error",
error: e.message,
}
await dataSourceDocsRepository.updateByVersion(doc.id, doc.version, {
status: "error",
error: e.message,
});
} finally {
// log usage in billing
@ -392,10 +444,15 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
}
// fetch docs that need to be deleted
const deletedDocs = await dataSourceDocsCollection.find({
sourceId: job._id.toString(),
status: "deleted",
}).toArray();
const deletedDocs = [];
cursor = undefined;
do {
const result = await dataSourceDocsRepository.list(job.id, {
status: ["deleted"],
}, cursor);
deletedDocs.push(...result.items);
cursor = result.nextCursor;
} while (cursor);
logger.log(`Found ${deletedDocs.length} docs to delete`);
@ -405,55 +462,32 @@ async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<ty
} catch (e: any) {
errors = true;
logger.log("Error deleting doc:", e);
await dataSourceDocsCollection.updateOne({
_id: doc._id,
version: doc.version,
}, {
$set: {
status: "error",
error: e.message,
}
await dataSourceDocsRepository.updateByVersion(doc.id, doc.version, {
status: "error",
error: e.message,
});
}
}
} catch (e) {
if (e instanceof BillingError) {
logger.log("Billing error:", e.message);
await dataSourcesCollection.updateOne({
_id: job._id,
version: job.version,
}, {
$set: {
status: "error",
billingError: e.message,
lastUpdatedAt: new Date().toISOString(),
}
await dataSourcesRepository.release(job.id, job.version, {
status: "error",
billingError: e.message,
});
}
logger.log("Error processing job; will retry:", e);
await dataSourcesCollection.updateOne({
_id: job._id,
version: job.version,
}, {
$set: {
status: "error",
lastUpdatedAt: new Date().toISOString(),
}
await dataSourcesRepository.release(job.id, job.version, {
status: "error",
});
continue;
}
// mark job as complete
logger.log("Marking job as completed...");
await dataSourcesCollection.updateOne({
_id: job._id,
version: job.version,
}, {
$set: {
status: errors ? "error" : "ready",
...(errors ? { error: "There were some errors processing this job" } : {}),
lastUpdatedAt: new Date().toISOString(),
}
await dataSourcesRepository.release(job.id, job.version, {
status: errors ? "error" : "ready",
...(errors ? { error: "There were some errors processing this job" } : {}),
});
}
})();

View file

@ -1,345 +0,0 @@
import '../lib/loadenv';
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { z } from 'zod';
import { dataSourceDocsCollection, dataSourcesCollection } from '../lib/mongodb';
import { EmbeddingRecord, DataSourceDoc, DataSource } from "../lib/types/datasource_types";
import { WithId } from 'mongodb';
import { embedMany } from 'ai';
import { embeddingModel } from '../lib/embedding';
import { qdrantClient } from '../lib/qdrant';
import { PrefixLogger } from "../lib/utils";
import crypto from 'crypto';
import { USE_BILLING } from '../lib/feature_flags';
import { authorize, getCustomerIdForProject, logUsage, UsageTracker } from '../lib/billing';
import { BillingError } from '@/src/entities/errors/common';
// Shared text splitter used to cut document content into chunks before embedding.
// Separators are tried from coarsest (blank line) to finest (empty string).
// NOTE(review): chunkSize/chunkOverlap are presumably measured in characters — confirm against the splitter's length function.
const splitter = new RecursiveCharacterTextSplitter({
separators: ['\n\n', '\n', '. ', '.', ''],
chunkSize: 1024,
chunkOverlap: 20,
});
// Time unit helpers expressed in milliseconds (they are combined with Date.now() and setTimeout below).
const second = 1000;
const minute = 60 * second;
const hour = 60 * minute;
/**
 * Indexes a single plain-text document for the text worker.
 *
 * Steps, in order: split `doc.data.content` into chunks, embed every chunk,
 * report the embedding token usage to `usageTracker`, upsert one point per
 * embedding into the "embeddings" collection, then mark the Mongo doc record
 * as "ready" (matched on both `_id` and `version`) with the stored content.
 *
 * @param _logger parent logger; a child logger scoped by doc id and doc name is derived from it
 * @param usageTracker receives an EMBEDDING_MODEL_USAGE entry for the embedding call
 * @param job data source that owns the document
 * @param doc document to index; must carry `data.type === 'text'`
 * @throws Error("Invalid data source type") when the document is not a text document
 */
async function runProcessPipeline(_logger: PrefixLogger, usageTracker: UsageTracker, job: WithId<z.infer<typeof DataSource>>, doc: WithId<z.infer<typeof DataSourceDoc>>) {
    const logger = _logger.child(doc._id.toString()).child(doc.name);

    // Only text documents are handled by this worker.
    if (doc.data.type !== 'text') {
        throw new Error("Invalid data source type");
    }
    const text = doc.data.content;

    // split into chunks
    logger.log("Splitting into chunks");
    const chunks = await splitter.createDocuments([text]);

    // generate embeddings
    logger.log("Generating embeddings");
    const embedResult = await embedMany({
        model: embeddingModel,
        values: chunks.map((chunk) => chunk.pageContent),
    });
    usageTracker.track({
        type: "EMBEDDING_MODEL_USAGE",
        modelName: embeddingModel.modelId,
        tokens: embedResult.usage.tokens,
        context: "rag.text.embedding_usage",
    });

    // store embeddings in qdrant: one point per embedding, paired with its chunk by index
    logger.log("Storing embeddings in Qdrant");
    const points: z.infer<typeof EmbeddingRecord>[] = [];
    embedResult.embeddings.forEach((vector, index) => {
        points.push({
            id: crypto.randomUUID(),
            vector,
            payload: {
                projectId: job.projectId,
                sourceId: job._id.toString(),
                docId: doc._id.toString(),
                content: chunks[index].pageContent,
                title: doc.name,
                name: doc.name,
            },
        });
    });
    await qdrantClient.upsert("embeddings", { points });

    // store content in doc record; the version in the filter means a doc
    // whose version changed in the meantime is left untouched
    logger.log("Storing content in doc record");
    const docFilter = {
        _id: doc._id,
        version: doc.version,
    };
    await dataSourceDocsCollection.updateOne(docFilter, {
        $set: {
            content: text,
            status: "ready",
            lastUpdatedAt: new Date().toISOString(),
        }
    });
}
/**
 * Removes one document: first its points in the "embeddings" collection
 * (matched on project id, source id and doc id), then its Mongo record.
 *
 * @param _logger parent logger; a child logger scoped by doc id and doc name is derived from it
 * @param job data source that owns the document
 * @param doc document to remove
 */
async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<typeof DataSource>>, doc: WithId<z.infer<typeof DataSourceDoc>>): Promise<void> {
    const logger = _logger.child(doc._id.toString()).child(doc.name);

    // Delete embeddings from qdrant
    logger.log("Deleting embeddings from Qdrant");
    const mustMatch = [
        { key: "projectId", match: { value: job.projectId } },
        { key: "sourceId", match: { value: job._id.toString() } },
        { key: "docId", match: { value: doc._id.toString() } },
    ];
    await qdrantClient.delete("embeddings", {
        filter: { must: mustMatch },
    });

    // Delete docs from db
    logger.log("Deleting doc from db");
    const docFilter = { _id: doc._id };
    await dataSourceDocsCollection.deleteOne(docFilter);
}
// Worker entry point: loops forever, claiming one "text" data source per
// iteration from mongodb, processing it, and recording the outcome on the
// source record.
// fetch next job from mongodb
(async () => {
while (true) {
const now = Date.now();
let job: WithId<z.infer<typeof DataSource>> | null = null;
// first try to find a job that needs deleting
// (claiming a job stamps lastAttemptAt and increments attempts; oldest createdAt wins)
job = await dataSourcesCollection.findOneAndUpdate({
status: "deleted",
"data.type": "text",
$or: [
{ attempts: { $exists: false } },
{ attempts: { $lte: 3 } }
]
}, { $set: { lastAttemptAt: new Date().toISOString() }, $inc: { attempts: 1 } }, { returnDocument: "after", sort: { createdAt: 1 } });
if (job === null) {
// no deletion pending: try to claim a job that needs (re)processing
job = await dataSourcesCollection.findOneAndUpdate(
{
$and: [
{ 'data.type': { $eq: "text" } },
{
$or: [
// if the job has never been attempted
{
status: "pending",
attempts: 0,
},
// if the job was attempted but wasn't completed in the last hour
{
status: "pending",
lastAttemptAt: { $lt: new Date(now - 1 * hour).toISOString() },
},
// if the job errored out but hasn't been retried 3 times yet
{
status: "error",
attempts: { $lt: 3 },
},
// if the job errored out but hasn't been retried in the last hour
// (the cutoff below is 1 * hour, not 5 minutes)
{
status: "error",
lastAttemptAt: { $lt: new Date(now - 1 * hour).toISOString() },
},
]
}
]
},
{
// claiming the job resets it to "pending" and records this attempt
$set: {
status: "pending",
lastAttemptAt: new Date().toISOString(),
},
$inc: {
attempts: 1
},
},
{ returnDocument: "after", sort: { createdAt: 1 } }
);
}
if (job === null) {
// if no doc found, sleep for a bit and start again
await new Promise(resolve => setTimeout(resolve, 5 * second));
continue;
}
const logger = new PrefixLogger(`${job._id.toString()}-${job.version}`);
logger.log(`Starting job ${job._id}. Type: ${job.data.type}. Status: ${job.status}`);
// set when any individual doc fails; decides the final status of the job
let errors = false;
try {
if (job.data.type !== 'text') {
throw new Error("Invalid data source type");
}
if (job.status === "deleted") {
// delete all embeddings for this source
logger.log("Deleting embeddings from Qdrant");
await qdrantClient.delete("embeddings", {
filter: {
must: [
{ key: "projectId", match: { value: job.projectId } },
{ key: "sourceId", match: { value: job._id.toString() } },
],
},
});
// delete all docs for this source
logger.log("Deleting docs from db");
await dataSourceDocsCollection.deleteMany({
sourceId: job._id.toString(),
});
// delete the source record from db
logger.log("Deleting source record from db");
await dataSourcesCollection.deleteOne({
_id: job._id,
});
logger.log("Job deleted");
// the source record is gone, so skip the completion update at the bottom of the loop
continue;
}
// fetch docs that need updating
const pendingDocs = await dataSourceDocsCollection.find({
sourceId: job._id.toString(),
status: { $in: ["pending", "error"] },
}).toArray();
logger.log(`Found ${pendingDocs.length} docs to process`);
// fetch project, user and billing data
let billingCustomerId: string | null = null;
if (USE_BILLING) {
try {
billingCustomerId = await getCustomerIdForProject(job.projectId);
} catch (e) {
logger.log("Unable to fetch billing customer id:", e);
throw new Error("Unable to fetch billing customer id");
}
}
// for each doc
for (const doc of pendingDocs) {
// authorize with billing
// (a failed authorization throws out of the loop and aborts the whole job)
if (USE_BILLING && billingCustomerId) {
const authResponse = await authorize(billingCustomerId, {
type: "use_credits",
});
if ('error' in authResponse) {
throw new BillingError(authResponse.error || "Unknown billing error")
}
}
const usageTracker = new UsageTracker();
try {
await runProcessPipeline(logger, usageTracker, job, doc);
} catch (e: any) {
// a failing doc is marked as errored but does not stop the remaining docs
errors = true;
logger.log("Error processing doc:", e);
await dataSourceDocsCollection.updateOne({
_id: doc._id,
version: doc.version,
}, {
$set: {
status: "error",
error: e.message,
}
});
} finally {
// log usage in billing
// (runs whether or not the pipeline succeeded)
if (USE_BILLING && billingCustomerId) {
await logUsage(billingCustomerId, {
items: usageTracker.flush(),
});
}
}
}
// fetch docs that need to be deleted
const deletedDocs = await dataSourceDocsCollection.find({
sourceId: job._id.toString(),
status: "deleted",
}).toArray();
logger.log(`Found ${deletedDocs.length} docs to delete`);
for (const doc of deletedDocs) {
try {
await runDeletionPipeline(logger, job, doc);
} catch (e: any) {
errors = true;
logger.log("Error deleting doc:", e);
await dataSourceDocsCollection.updateOne({
_id: doc._id,
version: doc.version,
}, {
$set: {
status: "error",
error: e.message,
}
});
}
}
} catch (e) {
// job-level failure: record it on the source and move on to the next job
if (e instanceof BillingError) {
logger.log("Billing error:", e.message);
await dataSourcesCollection.updateOne({
_id: job._id,
version: job.version,
}, {
$set: {
status: "error",
billingError: e.message,
lastUpdatedAt: new Date().toISOString(),
}
});
}
// NOTE: a BillingError is not short-circuited above, so it also reaches this generic update
logger.log("Error processing job; will retry:", e);
await dataSourcesCollection.updateOne({
_id: job._id,
version: job.version,
}, {
$set: {
status: "error",
lastUpdatedAt: new Date().toISOString(),
}
});
continue;
}
// mark job as complete
// ("error" if any doc failed, otherwise "ready"; only applied if the version still matches)
logger.log("Marking job as completed...");
await dataSourcesCollection.updateOne({
_id: job._id,
version: job.version,
}, {
$set: {
status: errors ? "error" : "ready",
...(errors ? { error: "There were some errors processing this job" } : {}),
}
});
}
})();

View file

@ -1,381 +0,0 @@
import '../lib/loadenv';
import FirecrawlApp from '@mendable/firecrawl-js';
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { z } from 'zod';
import { dataSourceDocsCollection, dataSourcesCollection } from '../lib/mongodb';
import { EmbeddingRecord, DataSourceDoc, DataSource } from "../lib/types/datasource_types";
import { WithId } from 'mongodb';
import { embedMany } from 'ai';
import { embeddingModel } from '../lib/embedding';
import { qdrantClient } from '../lib/qdrant';
import { PrefixLogger } from "../lib/utils";
import crypto from 'crypto';
import { USE_BILLING } from '../lib/feature_flags';
import { authorize, getCustomerIdForProject, logUsage, UsageTracker } from '../lib/billing';
import { BillingError } from '@/src/entities/errors/common';
// Firecrawl client used to scrape URLs; the API key is read from the environment.
const firecrawl = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
// Shared text splitter used to cut scraped markdown into chunks before embedding.
// Separators are tried from coarsest (blank line) to finest (empty string).
// NOTE(review): chunkSize/chunkOverlap are presumably measured in characters — confirm against the splitter's length function.
const splitter = new RecursiveCharacterTextSplitter({
separators: ['\n\n', '\n', '. ', '.', ''],
chunkSize: 1024,
chunkOverlap: 20,
});
// Time unit helpers, each built from the previous one.
// NOTE(review): presumably milliseconds (second = 1000) — confirm at the usage sites.
const second = 1000;
const minute = 60 * second;
const hour = 60 * minute;
const day = 24 * hour;
/**
 * Runs `fn` and resolves with its result, retrying immediately (no delay)
 * when it rejects.
 *
 * `fn` is always invoked at least once. After the `maxAttempts`-th failure
 * the most recent error is rethrown unchanged.
 *
 * @param fn async operation to run
 * @param maxAttempts maximum number of invocations before giving up (default 3)
 * @returns the value `fn` resolved with
 * @throws whatever `fn` threw on its final failed attempt
 */
async function retryable<T>(fn: () => Promise<T>, maxAttempts: number = 3): Promise<T> {
    // `failures` is the number of failed calls once the current one has failed.
    for (let failures = 1; ; failures++) {
        try {
            return await fn();
        } catch (err) {
            if (failures >= maxAttempts) {
                throw err;
            }
        }
    }
}
/**
 * Scrapes and indexes a single URL document.
 *
 * Steps, in order: scrape the URL as markdown through Firecrawl (wrapped in
 * `retryable` with up to 3 attempts), record a scrape usage entry, split the
 * markdown into chunks, embed every chunk and record the token usage, upsert
 * one point per embedding into the "embeddings" collection, then mark the
 * Mongo doc record as "ready" (matched on `_id` and `version`) with the
 * scraped markdown as its content.
 *
 * @param _logger parent logger; a child logger scoped by doc id and doc name is derived from it
 * @param usageTracker receives FIRECRAWL_SCRAPE_USAGE and EMBEDDING_MODEL_USAGE entries
 * @param job data source that owns the document
 * @param doc document to scrape; must carry `data.type === 'url'`
 * @throws Error when the document is not a URL document or the scrape reports failure
 */
async function runScrapePipeline(_logger: PrefixLogger, usageTracker: UsageTracker, job: WithId<z.infer<typeof DataSource>>, doc: WithId<z.infer<typeof DataSourceDoc>>) {
    const logger = _logger.child(doc._id.toString()).child(doc.name);

    // scrape the url using firecrawl
    logger.log("Scraping using Firecrawl");
    const attemptScrape = async () => {
        // The type check lives inside the retried callback, as in the original flow.
        if (doc.data.type !== 'url') {
            throw new Error("Invalid data source type");
        }
        const response = await firecrawl.scrapeUrl(doc.data.url, {
            formats: ['markdown'],
            onlyMainContent: true,
            excludeTags: ['script', 'style', 'noscript', 'img',]
        });
        if (!response.success) {
            throw new Error("Unable to scrape URL: " + doc.data.url);
        }
        return response;
    };
    const scrapeResult = await retryable(attemptScrape, 3); // Retry up to 3 times
    usageTracker.track({
        type: "FIRECRAWL_SCRAPE_USAGE",
        context: "rag.urls.firecrawl_scrape",
    });

    // split into chunks (an absent markdown body is treated as empty text)
    logger.log("Splitting into chunks");
    const chunks = await splitter.createDocuments([scrapeResult.markdown || '']);

    // generate embeddings
    logger.log("Generating embeddings");
    const embedResult = await embedMany({
        model: embeddingModel,
        values: chunks.map((chunk) => chunk.pageContent),
    });
    usageTracker.track({
        type: "EMBEDDING_MODEL_USAGE",
        modelName: embeddingModel.modelId,
        tokens: embedResult.usage.tokens,
        context: "rag.urls.embedding_usage",
    });

    // store embeddings in qdrant: one point per embedding, paired with its chunk by index
    logger.log("Storing embeddings in Qdrant");
    const points: z.infer<typeof EmbeddingRecord>[] = [];
    embedResult.embeddings.forEach((vector, index) => {
        points.push({
            id: crypto.randomUUID(),
            vector,
            payload: {
                projectId: job.projectId,
                sourceId: job._id.toString(),
                docId: doc._id.toString(),
                content: chunks[index].pageContent,
                title: scrapeResult.metadata?.title || '',
                name: doc.name,
            },
        });
    });
    await qdrantClient.upsert("embeddings", { points });

    // store scraped markdown in doc record
    logger.log("Storing scraped markdown in doc record");
    const docFilter = {
        _id: doc._id,
        version: doc.version,
    };
    await dataSourceDocsCollection.updateOne(docFilter, {
        $set: {
            content: scrapeResult.markdown,
            status: "ready",
            lastUpdatedAt: new Date().toISOString(),
        }
    });
}
/**
 * Removes one doc: first its points in the Qdrant "embeddings" collection
 * (matched on projectId, sourceId and docId), then its record in the docs
 * collection.
 *
 * @param _logger - Parent logger; a child scoped to the doc id and name is derived from it.
 * @param job - The data source the doc belongs to.
 * @param doc - The doc to delete.
 */
async function runDeletionPipeline(_logger: PrefixLogger, job: WithId<z.infer<typeof DataSource>>, doc: WithId<z.infer<typeof DataSourceDoc>>): Promise<void> {
    const docId = doc._id.toString();
    const logger = _logger.child(docId).child(doc.name);

    // Delete embeddings from qdrant: every condition below must match
    logger.log("Deleting embeddings from Qdrant");
    const scope: [string, string][] = [
        ["projectId", job.projectId],
        ["sourceId", job._id.toString()],
        ["docId", docId],
    ];
    await qdrantClient.delete("embeddings", {
        filter: {
            must: scope.map(([key, value]) => ({ key, match: { value } })),
        },
    });

    // Delete the doc record from db
    logger.log("Deleting doc from db");
    await dataSourceDocsCollection.deleteOne({ _id: doc._id });
}
/**
 * Worker main loop. Runs forever, claiming one "urls" data source at a time
 * from MongoDB (earliest `createdAt` first) and processing it:
 *
 *  - sources with status "deleted" have their embeddings, docs and source
 *    record removed;
 *  - other claimed sources have their pending/errored docs scraped and their
 *    "deleted" docs removed, after which the source is marked "ready" or
 *    "error".
 *
 * When no job is available the loop sleeps for 5 seconds before polling again.
 */
(async () => {
    while (true) {
        const now = Date.now();
        let job: WithId<z.infer<typeof DataSource>> | null = null;

        // first try to find a job that needs deleting; eligible while the
        // `attempts` field is missing or at most 3
        job = await dataSourcesCollection.findOneAndUpdate({
            status: "deleted",
            "data.type": "urls",
            $or: [
                { attempts: { $exists: false } },
                { attempts: { $lte: 3 } }
            ]
        }, { $set: { lastAttemptAt: new Date().toISOString() }, $inc: { attempts: 1 } }, { returnDocument: "after", sort: { createdAt: 1 } });

        if (job === null) {
            // otherwise claim a job that needs processing
            job = await dataSourcesCollection.findOneAndUpdate(
                {
                    $and: [
                        { 'data.type': { $eq: "urls" } },
                        {
                            $or: [
                                // if the job has never been attempted
                                {
                                    status: "pending",
                                    attempts: 0,
                                },
                                // if the job was attempted but wasn't completed in the last hour
                                {
                                    status: "pending",
                                    lastAttemptAt: { $lt: new Date(now - 1 * hour).toISOString() },
                                },
                                // if the job errored out but hasn't been retried 3 times yet
                                {
                                    status: "error",
                                    attempts: { $lt: 3 },
                                },
                                // if the job errored out and its last attempt was more than an hour ago
                                // (regardless of how many attempts were made).
                                // NOTE(review): the comment here previously said "5 minutes" while the
                                // query uses one hour - confirm the intended cooldown.
                                {
                                    status: "error",
                                    lastAttemptAt: { $lt: new Date(now - 1 * hour).toISOString() },
                                },
                            ]
                        }
                    ]
                },
                {
                    $set: {
                        status: "pending",
                        lastAttemptAt: new Date().toISOString(),
                    },
                    $inc: {
                        attempts: 1
                    },
                },
                { returnDocument: "after", sort: { createdAt: 1 } }
            );
        }

        if (job === null) {
            // if no doc found, sleep for a bit and start again
            await new Promise(resolve => setTimeout(resolve, 5 * second));
            continue;
        }

        const logger = new PrefixLogger(`${job._id.toString()}-${job.version}`);
        logger.log(`Starting job ${job._id}. Type: ${job.data.type}. Status: ${job.status}`);
        // set when at least one doc failed; decides the final job status
        let errors = false;

        try {
            if (job.data.type !== 'urls') {
                throw new Error("Invalid data source type");
            }

            if (job.status === "deleted") {
                // delete all embeddings for this source
                logger.log("Deleting embeddings from Qdrant");
                await qdrantClient.delete("embeddings", {
                    filter: {
                        must: [
                            { key: "projectId", match: { value: job.projectId } },
                            { key: "sourceId", match: { value: job._id.toString() } },
                        ],
                    },
                });

                // delete all docs for this source
                logger.log("Deleting docs from db");
                await dataSourceDocsCollection.deleteMany({
                    sourceId: job._id.toString(),
                });

                // delete the source record from db
                logger.log("Deleting source record from db");
                await dataSourcesCollection.deleteOne({
                    _id: job._id,
                });

                logger.log("Job deleted");
                continue;
            }

            // fetch docs that need updating
            const pendingDocs = await dataSourceDocsCollection.find({
                sourceId: job._id.toString(),
                status: { $in: ["pending", "error"] },
            }).toArray();
            logger.log(`Found ${pendingDocs.length} docs to process`);

            // fetch the billing customer for this project (only when billing is enabled)
            let billingCustomerId: string | null = null;
            if (USE_BILLING) {
                try {
                    billingCustomerId = await getCustomerIdForProject(job.projectId);
                } catch (e) {
                    logger.log("Unable to fetch billing customer id:", e);
                    throw new Error("Unable to fetch billing customer id");
                }
            }

            // for each doc
            for (const doc of pendingDocs) {
                // authorize with billing; a refusal aborts the whole job via the outer catch
                if (USE_BILLING && billingCustomerId) {
                    const authResponse = await authorize(billingCustomerId, {
                        type: "use_credits",
                    });
                    if ('error' in authResponse) {
                        throw new BillingError(authResponse.error || "Unknown billing error")
                    }
                }

                const usageTracker = new UsageTracker();
                try {
                    await runScrapePipeline(logger, usageTracker, job, doc);
                } catch (e) {
                    // a failing doc does not stop the job; it is recorded and the loop moves on
                    errors = true;
                    logger.log("Error processing doc:", e);
                    await dataSourceDocsCollection.updateOne({
                        _id: doc._id,
                        version: doc.version,
                    }, {
                        $set: {
                            status: "error",
                            // thrown values are not guaranteed to be Error instances
                            error: e instanceof Error ? e.message : String(e),
                        }
                    });
                } finally {
                    // log usage in billing, whether or not the doc succeeded
                    if (USE_BILLING && billingCustomerId) {
                        await logUsage(billingCustomerId, {
                            items: usageTracker.flush(),
                        });
                    }
                }
            }

            // fetch docs that need to be deleted
            const deletedDocs = await dataSourceDocsCollection.find({
                sourceId: job._id.toString(),
                status: "deleted",
            }).toArray();
            logger.log(`Found ${deletedDocs.length} docs to delete`);

            for (const doc of deletedDocs) {
                try {
                    await runDeletionPipeline(logger, job, doc);
                } catch (e) {
                    errors = true;
                    logger.log("Error deleting doc:", e);
                    // Record the failure but leave the status as "deleted": switching it to
                    // "error" would make the pending/error query above re-scrape the doc on
                    // the next run and bring a deleted doc back as "ready".
                    await dataSourceDocsCollection.updateOne({
                        _id: doc._id,
                        version: doc.version,
                    }, {
                        $set: {
                            error: e instanceof Error ? e.message : String(e),
                        }
                    });
                }
            }
        } catch (e) {
            if (e instanceof BillingError) {
                logger.log("Billing error:", e.message);
            }
            logger.log("Error processing job; will retry:", e);
            // A source whose deletion failed keeps its "deleted" status so that the deletion
            // is retried; marking it "error" would let the processing query above claim it
            // and scrape a source that was meant to be removed.
            await dataSourcesCollection.updateOne({
                _id: job._id,
                version: job.version,
            }, {
                $set: {
                    status: job.status === "deleted" ? "deleted" : "error",
                    ...(e instanceof BillingError ? { billingError: e.message } : {}),
                    lastUpdatedAt: new Date().toISOString(),
                }
            });
            continue;
        }

        // mark job as complete
        logger.log("Marking job as completed...");
        await dataSourcesCollection.updateOne({
            _id: job._id,
            version: job.version,
        }, {
            $set: {
                status: errors ? "error" : "ready",
                ...(errors ? { error: "There were some errors processing this job" } : {}),
            }
        });
    }
})();