diff --git a/.env.example b/.env.example new file mode 100644 index 000000000..cc6ebe313 --- /dev/null +++ b/.env.example @@ -0,0 +1,17 @@ +# Frontend Configuration +FRONTEND_PORT=3000 +NEXT_PUBLIC_API_URL=http://backend:8000 + +# Backend Configuration +BACKEND_PORT=8000 + +# Database Configuration +POSTGRES_USER=postgres +POSTGRES_PASSWORD=postgres +POSTGRES_DB=surfsense +POSTGRES_PORT=5432 + +# pgAdmin Configuration +PGADMIN_PORT=5050 +PGADMIN_DEFAULT_EMAIL=admin@surfsense.com +PGADMIN_DEFAULT_PASSWORD=surfsense diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..a80f14583 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,45 @@ + + +## Description + + +## Motivation and Context + + +FIX # + +## Changes Overview + +- + +## Screenshots + + +## API Changes + +- [ ] This PR includes API changes + +## Types of changes + +- [ ] Bug fix (non-breaking change which fixes an issue) +- [ ] New feature (non-breaking change which adds functionality) +- [ ] Performance improvement (non-breaking change which enhances performance) +- [ ] Documentation update +- [ ] Breaking change (fix or feature that would cause existing functionality to change) + +## Testing + +- [ ] I have tested these changes locally +- [ ] I have added/updated unit tests +- [ ] I have added/updated integration tests + +## Checklist: + + +- [ ] My code follows the code style of this project +- [ ] My change requires documentation updates +- [ ] I have updated the documentation accordingly +- [ ] My change requires dependency updates +- [ ] I have updated the dependencies accordingly +- [ ] My code builds clean without any errors or warnings +- [ ] All new and existing tests passed \ No newline at end of file diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 000000000..9b7ecc6a0 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,76 @@ +name: Docker Publish + +on: + push: + branches: [ "main" ] + +jobs: + build_and_push_backend: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push backend image + uses: docker/build-push-action@v5 + with: + context: ./surfsense_backend + file: ./surfsense_backend/Dockerfile + push: true + tags: ghcr.io/${{ github.repository_owner }}/surfsense_backend:${{ github.sha }} + platforms: linux/amd64,linux/arm64 + labels: | + org.opencontainers.image.source=${{ github.repositoryUrl }} + org.opencontainers.image.created=${{ fromJSON(steps.meta.outputs.json).labels['org.opencontainers.image.created'] }} + org.opencontainers.image.revision=${{ github.sha }} + + build_and_push_frontend: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - 
name: Build and push frontend image + uses: docker/build-push-action@v5 + with: + context: ./surfsense_web + file: ./surfsense_web/Dockerfile + push: true + tags: ghcr.io/${{ github.repository_owner }}/surfsense_web:${{ github.sha }} + platforms: linux/amd64,linux/arm64 + labels: | + org.opencontainers.image.source=${{ github.repositoryUrl }} + org.opencontainers.image.created=${{ fromJSON(steps.meta.outputs.json).labels['org.opencontainers.image.created'] }} + org.opencontainers.image.revision=${{ github.sha }} diff --git a/.gitignore b/.gitignore index ac1266863..1a7f2267f 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.flashrank_cache* \ No newline at end of file +.flashrank_cache* +podcasts/ diff --git a/DEPLOYMENT_GUIDE.md b/DEPLOYMENT_GUIDE.md new file mode 100644 index 000000000..e4cc86dec --- /dev/null +++ b/DEPLOYMENT_GUIDE.md @@ -0,0 +1,124 @@ +# SurfSense Deployment Guide + +This guide explains the different deployment options available for SurfSense using Docker Compose. + +## Deployment Options + +SurfSense uses a flexible Docker Compose configuration that allows you to easily switch between deployment modes without manually editing files. Our approach uses Docker's built-in override functionality with two configuration files: + +1. **docker-compose.yml**: Contains essential core services (database and pgAdmin) +2. **docker-compose.override.yml**: Contains application services (frontend and backend) + +This structure provides several advantages: +- No need to comment/uncomment services manually +- Clear separation between core infrastructure and application services +- Easy switching between development and production environments + +## Deployment Modes + +### Full Stack Mode (Development) + +This mode runs everything: frontend, backend, database, and pgAdmin. It's ideal for development environments where you need the complete application stack. + +```bash +# Both files are automatically used (docker-compose.yml + docker-compose.override.yml) +docker compose up -d +``` + +### Core Services Mode (Production) + +This mode runs only the database and pgAdmin services. It's suitable for production environments where you might want to deploy the frontend and backend separately or need to run database migrations. 
+ +```bash +# Explicitly use only the main file +docker compose -f docker-compose.yml up -d +``` + +## Custom Deployment Options + +### Running Specific Services + +You can specify which services to start by naming them: + +```bash +# Start only database +docker compose up -d db + +# Start database and pgAdmin +docker compose up -d db pgadmin + +# Start only backend (requires db to be running) +docker compose up -d backend +``` + +### Using Custom Override Files + +You can create and use custom override files for different environments: + +```bash +# Create a staging configuration +docker compose -f docker-compose.yml -f docker-compose.staging.yml up -d +``` + +## Environment Variables + +The deployment can be customized using environment variables: + +```bash +# Change default ports +FRONTEND_PORT=4000 BACKEND_PORT=9000 docker compose up -d + +# Or use a .env file +# Create or modify .env file with your desired values +docker compose up -d +``` + +## Common Deployment Workflows + +### Initial Setup + +```bash +# Clone the repository +git clone https://github.com/MODSetter/SurfSense.git +cd SurfSense + +# Copy example env files +cp .env.example .env +cp surfsense_backend/.env.example surfsense_backend/.env +cp surfsense_web/.env.example surfsense_web/.env + +# Edit the .env files with your configuration + +# Start full stack for development +docker compose up -d +``` + +### Database-Only Mode (for migrations or maintenance) + +```bash +# Start just the database +docker compose -f docker-compose.yml up -d db + +# Run migrations or maintenance tasks +docker compose exec db psql -U postgres -d surfsense +``` + +### Scaling in Production + +For production deployments, you might want to: + +1. Run core services with Docker Compose +2. Deploy frontend/backend with specialized services like Vercel, Netlify, or dedicated application servers + +This separation allows for better scaling and resource utilization in production environments. + +## Troubleshooting + +If you encounter issues with the deployment: + +- Check container logs: `docker compose logs -f [service_name]` +- Ensure all required environment variables are set +- Verify network connectivity between containers +- Check that required ports are available and not blocked by firewalls + +For more detailed setup instructions, refer to [DOCKER_SETUP.md](DOCKER_SETUP.md). \ No newline at end of file diff --git a/DOCKER_SETUP.md b/DOCKER_SETUP.md index 44e6a142f..6b7ee4764 100644 --- a/DOCKER_SETUP.md +++ b/DOCKER_SETUP.md @@ -7,73 +7,186 @@ This document explains how to run the SurfSense project using Docker Compose. - Docker and Docker Compose installed on your machine - Git (to clone the repository) +## Environment Variables Configuration + +SurfSense Docker setup supports configuration through environment variables. You can set these variables in two ways: + +1. Create a `.env` file in the project root directory (copy from `.env.example`) +2. 
Set environment variables directly in your shell before running Docker Compose + +The following environment variables are available: + +``` +# Frontend Configuration +FRONTEND_PORT=3000 +NEXT_PUBLIC_API_URL=http://backend:8000 + +# Backend Configuration +BACKEND_PORT=8000 + +# Database Configuration +POSTGRES_USER=postgres +POSTGRES_PASSWORD=postgres +POSTGRES_DB=surfsense +POSTGRES_PORT=5432 + +# pgAdmin Configuration +PGADMIN_PORT=5050 +PGADMIN_DEFAULT_EMAIL=admin@surfsense.com +PGADMIN_DEFAULT_PASSWORD=surfsense +``` + +## Deployment Options + +SurfSense uses a flexible Docker Compose setup that allows you to choose between different deployment modes: + +### Option 1: Full-Stack Deployment (Development Mode) +Includes frontend, backend, database, and pgAdmin. This is the default when running `docker compose up`. + +### Option 2: Core Services Only (Production Mode) +Includes only database and pgAdmin, suitable for production environments where you might deploy frontend/backend separately. + +Our setup uses two files: +- `docker-compose.yml`: Contains core services (database and pgAdmin) +- `docker-compose.override.yml`: Contains application services (frontend and backend) + ## Setup 1. Make sure you have all the necessary environment variables set up: - Copy `surfsense_backend/.env.example` to `surfsense_backend/.env` and fill in the required values - Copy `surfsense_web/.env.example` to `surfsense_web/.env` and fill in the required values + - Optionally: Copy `.env.example` to `.env` in the project root to customize Docker settings -2. Build and start the containers: +2. Deploy based on your needs: + + **Full Stack (Development Mode)**: ```bash - docker-compose up --build + # Both files are automatically used + docker compose up --build + ``` + + **Core Services Only (Production Mode)**: + ```bash + # Explicitly use only the main file + docker compose -f docker-compose.yml up --build ``` 3. To run in detached mode (in the background): ```bash - docker-compose up -d + # Full stack + docker compose up -d + + # Core services only + docker compose -f docker-compose.yml up -d ``` 4. Access the applications: - - Frontend: http://localhost:3000 - - Backend API: http://localhost:8000 - - API Documentation: http://localhost:8000/docs + - Frontend: http://localhost:3000 (when using full stack) + - Backend API: http://localhost:8000 (when using full stack) + - API Documentation: http://localhost:8000/docs (when using full stack) + - pgAdmin: http://localhost:5050 + +## Customizing the Deployment + +If you need to make temporary changes to either full stack or core services deployment, you can: + +1. **Temporarily disable override file**: + ```bash + docker compose -f docker-compose.yml up -d + ``` + +2. **Use a custom override file**: + ```bash + docker compose -f docker-compose.yml -f custom-override.yml up -d + ``` + +3. 
**Temporarily modify which services start**: + ```bash + docker compose up -d db pgadmin + ``` ## Useful Commands - Stop the containers: ```bash - docker-compose down + docker compose down ``` - View logs: ```bash # All services - docker-compose logs -f + docker compose logs -f # Specific service - docker-compose logs -f backend - docker-compose logs -f frontend - docker-compose logs -f db + docker compose logs -f backend + docker compose logs -f frontend + docker compose logs -f db + docker compose logs -f pgadmin ``` - Restart a specific service: ```bash - docker-compose restart backend + docker compose restart backend ``` - Execute commands in a running container: ```bash # Backend - docker-compose exec backend python -m pytest + docker compose exec backend python -m pytest # Frontend - docker-compose exec frontend pnpm lint + docker compose exec frontend pnpm lint ``` ## Database The PostgreSQL database with pgvector extensions is available at: - Host: localhost -- Port: 5432 -- Username: postgres -- Password: postgres -- Database: surfsense +- Port: 5432 (configurable via POSTGRES_PORT) +- Username: postgres (configurable via POSTGRES_USER) +- Password: postgres (configurable via POSTGRES_PASSWORD) +- Database: surfsense (configurable via POSTGRES_DB) -You can connect to it using any PostgreSQL client. +You can connect to it using any PostgreSQL client or the included pgAdmin. + +## pgAdmin + +pgAdmin is a web-based administration tool for PostgreSQL. It is included in the Docker setup for easier database management. + +- URL: http://localhost:5050 (configurable via PGADMIN_PORT) +- Default Email: admin@surfsense.com (configurable via PGADMIN_DEFAULT_EMAIL) +- Default Password: surfsense (configurable via PGADMIN_DEFAULT_PASSWORD) + +### Connecting to the Database in pgAdmin + +1. Log in to pgAdmin using the credentials above +2. Right-click on "Servers" in the left sidebar and select "Create" > "Server" +3. In the "General" tab, give your connection a name (e.g., "SurfSense DB") +4. In the "Connection" tab, enter the following: + - Host: db + - Port: 5432 + - Maintenance database: surfsense + - Username: postgres + - Password: postgres +5. Click "Save" to establish the connection ## Troubleshooting - If you encounter permission errors, you may need to run the docker commands with `sudo`. -- If ports are already in use, modify the port mappings in the `docker-compose.yml` file. +- If ports are already in use, modify the port mappings in the `.env` file or directly in the `docker-compose.yml` file. - For backend dependency issues, you may need to modify the `Dockerfile` in the backend directory. -- For frontend dependency issues, you may need to modify the `Dockerfile` in the frontend directory. +- If you encounter frontend dependency errors, adjust the frontend's `Dockerfile` accordingly. +- If pgAdmin doesn't connect to the database, ensure you're using `db` as the hostname, not `localhost`, as that's the Docker network name. +- If you need only specific services, you can explicitly name them: `docker compose up db pgadmin` + +## Understanding Docker Compose File Structure + +The project uses Docker's default override mechanism: + +1. **docker-compose.yml**: Contains essential services (database and pgAdmin) +2. **docker-compose.override.yml**: Contains development services (frontend and backend) + +When you run `docker compose up` without additional flags, Docker automatically merges both files. 
+When you run `docker compose -f docker-compose.yml up`, only the specified file is used. + +This approach lets you maintain a cleaner codebase without manually commenting/uncommenting services in your configuration files. diff --git a/README.md b/README.md index e412fe2be..7272206a4 100644 --- a/README.md +++ b/README.md @@ -1,39 +1,53 @@ - - ![new_header](https://github.com/user-attachments/assets/e236b764-0ddc-42ff-a1f1-8fbb3d2e0e65) + # SurfSense -While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more to come. +While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily, LinkUp), Slack, Linear, Notion, YouTube, GitHub and more to come. + +
+[MODSetter/SurfSense | Trendshift badge]
+
# Video https://github.com/user-attachments/assets/48142909-6391-4084-b7e8-81da388bb1fc +# Podcast's + +https://github.com/user-attachments/assets/d516982f-de00-4c41-9e4c-632a7d942f41 + +## Podcast Sample + +https://github.com/user-attachments/assets/bf64a6ca-934b-47ac-9e1b-edac5fe972ec ## Key Features -### 1. Latest -#### 💡 **Idea**: +### 💡 **Idea**: Have your own highly customizable private NotebookLM and Perplexity integrated with external sources. -#### 📁 **Multiple File Format Uploading Support** -Save content from your own personal files *(Documents, images and supports **27 file extensions**)* to your own personal knowledge base . -#### 🔍 **Powerful Search** +### 📁 **Multiple File Format Uploading Support** +Save content from your own personal files *(Documents, images, videos and supports **50+ file extensions**)* to your own personal knowledge base . +### 🔍 **Powerful Search** Quickly research or find anything in your saved content . -#### đŸ’Ŧ **Chat with your Saved Content** +### đŸ’Ŧ **Chat with your Saved Content** Interact in Natural Language and get cited answers. -#### 📄 **Cited Answers** +### 📄 **Cited Answers** Get Cited answers just like Perplexity. -#### 🔔 **Privacy & Local LLM Support** +### 🔔 **Privacy & Local LLM Support** Works Flawlessly with Ollama local LLMs. -#### 🏠 **Self Hostable** +### 🏠 **Self Hostable** Open source and easy to deploy locally. -#### 📊 **Advanced RAG Techniques** +### đŸŽ™ī¸ Podcasts +- Blazingly fast podcast generation agent. (Creates a 3-minute podcast in under 20 seconds.) +- Convert your chat conversations into engaging audio content +- Support for multiple TTS providers (OpenAI, Azure, Google Vertex AI) + +### 📊 **Advanced RAG Techniques** - Supports 150+ LLM's - Supports 6000+ Embedding Models. - Supports all major Rerankers (Pinecode, Cohere, Flashrank etc) @@ -41,8 +55,8 @@ Open source and easy to deploy locally. - Utilizes Hybrid Search (Semantic + Full Text Search combined with Reciprocal Rank Fusion). - RAG as a Service API Backend. -#### â„šī¸ **External Sources** -- Search Engines (Tavily) +### â„šī¸ **External Sources** +- Search Engines (Tavily, LinkUp) - Slack - Linear - Notion @@ -50,17 +64,41 @@ Open source and easy to deploy locally. - GitHub - and more to come..... -#### 🔖 Cross Browser Extension +## 📄 **Supported File Extensions** + +> **Note**: File format support depends on your ETL service configuration. LlamaCloud supports 50+ formats, while Unstructured supports 34+ core formats. 
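As the note above says, format support follows the ETL service you configure. For reference, that choice is made in the backend environment file via the same `ETL_SERVICE`, `UNSTRUCTURED_API_KEY`, and `LLAMA_CLOUD_API_KEY` variables added to `surfsense_backend/.env.example` later in this diff. A minimal sketch of the setting, with illustrative placeholder values:

```bash
# surfsense_backend/.env — pick exactly one ETL service
ETL_SERVICE="LLAMACLOUD"          # 50+ supported formats
LLAMA_CLOUD_API_KEY="llx-..."     # placeholder key

# or, alternatively:
# ETL_SERVICE="UNSTRUCTURED"      # 34+ core formats
# UNSTRUCTURED_API_KEY="..."      # placeholder key
```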
+ +### Documents & Text +**LlamaCloud**: `.pdf`, `.doc`, `.docx`, `.docm`, `.dot`, `.dotm`, `.rtf`, `.txt`, `.xml`, `.epub`, `.odt`, `.wpd`, `.pages`, `.key`, `.numbers`, `.602`, `.abw`, `.cgm`, `.cwk`, `.hwp`, `.lwp`, `.mw`, `.mcw`, `.pbd`, `.sda`, `.sdd`, `.sdp`, `.sdw`, `.sgl`, `.sti`, `.sxi`, `.sxw`, `.stw`, `.sxg`, `.uof`, `.uop`, `.uot`, `.vor`, `.wps`, `.zabw` + +**Unstructured**: `.doc`, `.docx`, `.odt`, `.rtf`, `.pdf`, `.xml`, `.txt`, `.md`, `.markdown`, `.rst`, `.html`, `.org`, `.epub` + +### Presentations +**LlamaCloud**: `.ppt`, `.pptx`, `.pptm`, `.pot`, `.potm`, `.potx`, `.odp`, `.key` + +**Unstructured**: `.ppt`, `.pptx` + +### Spreadsheets & Data +**LlamaCloud**: `.xlsx`, `.xls`, `.xlsm`, `.xlsb`, `.xlw`, `.csv`, `.tsv`, `.ods`, `.fods`, `.numbers`, `.dbf`, `.123`, `.dif`, `.sylk`, `.slk`, `.prn`, `.et`, `.uos1`, `.uos2`, `.wk1`, `.wk2`, `.wk3`, `.wk4`, `.wks`, `.wq1`, `.wq2`, `.wb1`, `.wb2`, `.wb3`, `.qpw`, `.xlr`, `.eth` + +**Unstructured**: `.xls`, `.xlsx`, `.csv`, `.tsv` + +### Images +**LlamaCloud**: `.jpg`, `.jpeg`, `.png`, `.gif`, `.bmp`, `.svg`, `.tiff`, `.webp`, `.html`, `.htm`, `.web` + +**Unstructured**: `.jpg`, `.jpeg`, `.png`, `.bmp`, `.tiff`, `.heic` + +### Audio & Video *(Always Supported)* +`.mp3`, `.mpga`, `.m4a`, `.wav`, `.mp4`, `.mpeg`, `.webm` + +### Email & Communication +**Unstructured**: `.eml`, `.msg`, `.p7s` + +### 🔖 Cross Browser Extension - The SurfSense extension can be used to save any webpage you like. - Its main usecase is to save any webpages protected beyond authentication. -### 2. Temporarily Deprecated - -#### Podcasts -- The SurfSense Podcast feature is currently being reworked for better UI and stability. Expect it soon. - - ## FEATURE REQUESTS AND FUTURE @@ -76,7 +114,13 @@ Join the [SurfSense Discord](https://discord.gg/ejRNvftDp9) and help shape the f SurfSense provides two installation methods: -1. **[Docker Installation](https://www.surfsense.net/docs/docker-installation)** - The easiest way to get SurfSense up and running with all dependencies containerized. Less Customization. +1. **[Docker Installation](https://www.surfsense.net/docs/docker-installation)** - The easiest way to get SurfSense up and running with all dependencies containerized. + - Includes pgAdmin for database management through a web UI + - Supports environment variable customization via `.env` file + - Flexible deployment options (full stack or core services only) + - No need to manually edit configuration files between environments + - See [Docker Setup Guide](DOCKER_SETUP.md) for detailed instructions + - For deployment scenarios and options, see [Deployment Guide](DEPLOYMENT_GUIDE.md) 2. **[Manual Installation (Recommended)](https://www.surfsense.net/docs/manual-installation)** - For users who prefer more control over their setup or need to customize their deployment. 
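The Docker docs added in this diff reference custom override files (`docker compose -f docker-compose.yml -f docker-compose.staging.yml up -d`) but never show one. Below is a minimal sketch of such an override, assuming you want to run the images published by the new `docker-publish.yml` workflow instead of building locally; the image names follow the workflow's tags, and `<owner>` / `<commit-sha>` are placeholders you would substitute:

```yaml
# docker-compose.staging.yml (hypothetical override, used together with docker-compose.yml)
version: '3.8'

services:
  backend:
    image: ghcr.io/<owner>/surfsense_backend:<commit-sha>   # tag pushed by the CI workflow
    ports:
      - "${BACKEND_PORT:-8000}:8000"
    env_file:
      - ./surfsense_backend/.env
    environment:
      - DATABASE_URL=postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-surfsense}
    depends_on:
      - db

  frontend:
    image: ghcr.io/<owner>/surfsense_web:<commit-sha>
    ports:
      - "${FRONTEND_PORT:-3000}:3000"
    environment:
      - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL:-http://backend:8000}
    depends_on:
      - backend
```

Started with `docker compose -f docker-compose.yml -f docker-compose.staging.yml up -d`, this reuses the `db` and `pgadmin` services from the base file while swapping the local builds for published images.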
@@ -84,7 +128,6 @@ Both installation guides include detailed OS-specific instructions for Windows, Before installation, make sure to complete the [prerequisite setup steps](https://www.surfsense.net/docs/) including: - PGVector setup -- Google OAuth configuration - Unstructured.io API key - Other required API keys @@ -101,6 +144,9 @@ Before installation, make sure to complete the [prerequisite setup steps](https: ![researcher](https://github.com/user-attachments/assets/fda3e61f-f936-4b66-b565-d84edde44a67) +**Podcast Agent** +![podcasts](https://github.com/user-attachments/assets/6cb82ffd-9e14-4172-bc79-67faf34c4c1c) + **Agent Chat** @@ -112,6 +158,7 @@ Before installation, make sure to complete the [prerequisite setup steps](https: ![ext2](https://github.com/user-attachments/assets/a9b9f1aa-2677-404d-b0a0-c1b2dddf24a7) + ## Tech Stack @@ -178,6 +225,14 @@ Before installation, make sure to complete the [prerequisite setup steps](https: - **@tanstack/react-table**: Headless UI for building powerful tables & datagrids. + ### **DevOps** + +- **Docker**: Container platform for consistent deployment across environments + +- **Docker Compose**: Tool for defining and running multi-container Docker applications + +- **pgAdmin**: Web-based PostgreSQL administration tool included in Docker setup + ### **Extension** Manifest v3 on Plasmo @@ -185,16 +240,8 @@ Before installation, make sure to complete the [prerequisite setup steps](https: ## Future Work - Add More Connectors. - Patch minor bugs. -- Implement Canvas. -- Complete Hybrid Search. **[Done]** -- Add support for file uploads QA. **[Done]** -- Shift to WebSockets for Streaming responses. **[Deprecated in favor of AI SDK Stream Protocol]** -- Based on feedback, I will work on making it compatible with local models. **[Done]** -- Cross Browser Extension **[Done]** -- Critical Notifications **[Done | PAUSED]** -- Saving Chats **[Done]** -- Basic keyword search page for saved sessions **[Done]** -- Multi & Single Document Chat **[Done]** +- Document Chat **[REIMPLEMENT]** +- Document Podcasts @@ -203,3 +250,13 @@ Before installation, make sure to complete the [prerequisite setup steps](https: Contributions are very welcome! A contribution can be as small as a ⭐ or even finding and creating issues. Fine-tuning the Backend is always desired. 
+## Star History + + + + + + Star History Chart + + + diff --git a/docker-compose.override.yml b/docker-compose.override.yml new file mode 100644 index 000000000..c971c68b5 --- /dev/null +++ b/docker-compose.override.yml @@ -0,0 +1,34 @@ +version: '3.8' + +services: + frontend: + build: + context: ./surfsense_web + dockerfile: Dockerfile + ports: + - "${FRONTEND_PORT:-3000}:3000" + volumes: + - ./surfsense_web:/app + - /app/node_modules + depends_on: + - backend + environment: + - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL:-http://backend:8000} + + backend: + build: + context: ./surfsense_backend + dockerfile: Dockerfile + ports: + - "${BACKEND_PORT:-8000}:8000" + volumes: + - ./surfsense_backend:/app + depends_on: + - db + env_file: + - ./surfsense_backend/.env + environment: + - DATABASE_URL=postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-surfsense} + - PYTHONPATH=/app + - UVICORN_LOOP=asyncio + - UNSTRUCTURED_HAS_PATCHED_LOOP=1 diff --git a/docker-compose.yml b/docker-compose.yml index 736400a6e..219933c48 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,48 +1,29 @@ version: '3.8' services: - frontend: - build: - context: ./surfsense_web - dockerfile: Dockerfile - ports: - - "3000:3000" - volumes: - - ./surfsense_web:/app - - /app/node_modules - depends_on: - - backend - environment: - - NEXT_PUBLIC_API_URL=http://backend:8000 - - backend: - build: - context: ./surfsense_backend - dockerfile: Dockerfile - ports: - - "8000:8000" - volumes: - - ./surfsense_backend:/app - depends_on: - - db - env_file: - - ./surfsense_backend/.env - environment: - - DATABASE_URL=postgresql+asyncpg://postgres:postgres@db:5432/surfsense - - PYTHONPATH=/app - - UVICORN_LOOP=asyncio - - UNSTRUCTURED_HAS_PATCHED_LOOP=1 - db: image: ankane/pgvector:latest ports: - - "5432:5432" + - "${POSTGRES_PORT:-5432}:5432" volumes: - postgres_data:/var/lib/postgresql/data environment: - - POSTGRES_USER=postgres - - POSTGRES_PASSWORD=postgres - - POSTGRES_DB=surfsense + - POSTGRES_USER=${POSTGRES_USER:-postgres} + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-postgres} + - POSTGRES_DB=${POSTGRES_DB:-surfsense} + + pgadmin: + image: dpage/pgadmin4 + ports: + - "${PGADMIN_PORT:-5050}:80" + environment: + - PGADMIN_DEFAULT_EMAIL=${PGADMIN_DEFAULT_EMAIL:-admin@surfsense.com} + - PGADMIN_DEFAULT_PASSWORD=${PGADMIN_DEFAULT_PASSWORD:-surfsense} + volumes: + - pgadmin_data:/var/lib/pgadmin + depends_on: + - db volumes: - postgres_data: \ No newline at end of file + postgres_data: + pgadmin_data: \ No newline at end of file diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 6dfcc9967..c0032a912 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -1,10 +1,15 @@ DATABASE_URL="postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense" SECRET_KEY="SECRET" -GOOGLE_OAUTH_CLIENT_ID="924507538m" -GOOGLE_OAUTH_CLIENT_SECRET="GOCSV" NEXT_FRONTEND_URL="http://localhost:3000" +#Auth +AUTH_TYPE="GOOGLE" or "LOCAL" +# For Google Auth Only +GOOGLE_OAUTH_CLIENT_ID="924507538m" +GOOGLE_OAUTH_CLIENT_SECRET="GOCSV" + +#Embedding Model EMBEDDING_MODEL="mixedbread-ai/mxbai-embed-large-v1" RERANKERS_MODEL_NAME="ms-marco-MiniLM-L-12-v2" @@ -15,15 +20,32 @@ FAST_LLM="openai/gpt-4o-mini" STRATEGIC_LLM="openai/gpt-4o" LONG_CONTEXT_LLM="gemini/gemini-2.0-flash" +#LiteLLM TTS Provider: https://docs.litellm.ai/docs/text_to_speech#supported-providers +TTS_SERVICE="openai/tts-1" + +#LiteLLM STT Provider: 
https://docs.litellm.ai/docs/audio_transcription#supported-providers +STT_SERVICE="openai/whisper-1" + # Chosen LiteLLM Providers Keys OPENAI_API_KEY="sk-proj-iA" GEMINI_API_KEY="AIzaSyB6-1641124124124124124124124124124" -UNSTRUCTURED_API_KEY="Tpu3P0U8iy" FIRECRAWL_API_KEY="fcr-01J0000000000000000000000" +#File Parser Service +ETL_SERVICE="UNSTRUCTURED" or "LLAMACLOUD" +UNSTRUCTURED_API_KEY="Tpu3P0U8iy" +LLAMA_CLOUD_API_KEY="llx-nnn" + #OPTIONAL: Add these for LangSmith Observability LANGSMITH_TRACING=true LANGSMITH_ENDPOINT="https://api.smith.langchain.com" LANGSMITH_API_KEY="lsv2_pt_....." LANGSMITH_PROJECT="surfsense" + +# OPTIONAL: LiteLLM API Base +FAST_LLM_API_BASE="" +STRATEGIC_LLM_API_BASE="" +LONG_CONTEXT_LLM_API_BASE="" +TTS_SERVICE_API_BASE="" +STT_SERVICE_API_BASE="" diff --git a/surfsense_backend/.gitignore b/surfsense_backend/.gitignore index ee59e4764..b89ba2402 100644 --- a/surfsense_backend/.gitignore +++ b/surfsense_backend/.gitignore @@ -5,3 +5,4 @@ data/ __pycache__/ .flashrank_cache surf_new_backend.egg-info/ +podcasts/ diff --git a/surfsense_backend/README.md b/surfsense_backend/README.md index 879fa4372..f78ec7df5 100644 --- a/surfsense_backend/README.md +++ b/surfsense_backend/README.md @@ -110,7 +110,6 @@ See pyproject.toml for detailed dependency information. Key dependencies include - fastapi and related packages - fastapi-users: Authentication and user management - firecrawl-py: Web crawling capabilities -- gpt-researcher: Advanced research capabilities - langchain components for AI workflows - litellm: LLM model integration - pgvector: Vector similarity search in PostgreSQL diff --git a/surfsense_backend/alembic/versions/1_add_github_connector_enum.py b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py index bb72838ad..1902777b6 100644 --- a/surfsense_backend/alembic/versions/1_add_github_connector_enum.py +++ b/surfsense_backend/alembic/versions/1_add_github_connector_enum.py @@ -2,7 +2,6 @@ Revision ID: 1 Revises: -Create Date: 2023-10-27 10:00:00.000000 """ from typing import Sequence, Union diff --git a/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py b/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py index d3527d34a..526c7c3ad 100644 --- a/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py +++ b/surfsense_backend/alembic/versions/2_add_linear_connector_enum.py @@ -2,7 +2,6 @@ Revision ID: 2 Revises: e55302644c51 -Create Date: 2025-04-16 10:00:00.000000 """ from typing import Sequence, Union diff --git a/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py index ab50d8550..e71ee2ed4 100644 --- a/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py +++ b/surfsense_backend/alembic/versions/3_add_linear_connector_to_documenttype_.py @@ -2,7 +2,6 @@ Revision ID: 3 Revises: 2 -Create Date: 2025-04-16 10:05:00.059921 """ from typing import Sequence, Union diff --git a/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py new file mode 100644 index 000000000..093bdf067 --- /dev/null +++ b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py @@ -0,0 +1,44 @@ +"""Add LINKUP_API to SearchSourceConnectorType enum + +Revision ID: 4 +Revises: 3 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '4' +down_revision: Union[str, None] = '3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + + # Manually add the command to add the enum value + op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'LINKUP_API'") + + # Pass for the rest, as autogenerate didn't run to add other schema details + pass + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + + # Downgrading removal of an enum value requires recreating the type + op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") + op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR', 'LINEAR_CONNECTOR')") + op.execute(( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + )) + op.execute("DROP TYPE searchsourceconnectortype_old") + + pass + # ### end Alembic commands ### \ No newline at end of file diff --git a/surfsense_backend/alembic/versions/5_remove_title_char_limit.py b/surfsense_backend/alembic/versions/5_remove_title_char_limit.py new file mode 100644 index 000000000..62fe019f4 --- /dev/null +++ b/surfsense_backend/alembic/versions/5_remove_title_char_limit.py @@ -0,0 +1,57 @@ +"""Remove char limit on title columns + +Revision ID: 5 +Revises: 4 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '5' +down_revision: Union[str, None] = '4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Alter Chat table + op.alter_column('chats', 'title', + existing_type=sa.String(200), + type_=sa.String(), + existing_nullable=False) + + # Alter Document table + op.alter_column('documents', 'title', + existing_type=sa.String(200), + type_=sa.String(), + existing_nullable=False) + + # Alter Podcast table + op.alter_column('podcasts', 'title', + existing_type=sa.String(200), + type_=sa.String(), + existing_nullable=False) + + +def downgrade() -> None: + # Revert Chat table + op.alter_column('chats', 'title', + existing_type=sa.String(), + type_=sa.String(200), + existing_nullable=False) + + # Revert Document table + op.alter_column('documents', 'title', + existing_type=sa.String(), + type_=sa.String(200), + existing_nullable=False) + + # Revert Podcast table + op.alter_column('podcasts', 'title', + existing_type=sa.String(), + type_=sa.String(200), + existing_nullable=False) \ No newline at end of file diff --git a/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py new file mode 100644 index 000000000..fa7a0f8f6 --- /dev/null +++ b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py @@ -0,0 +1,43 @@ +"""Change podcast_content to podcast_transcript with JSON type + +Revision ID: 6 +Revises: 5 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSON + + +# revision identifiers, used by Alembic. 
+revision: str = '6' +down_revision: Union[str, None] = '5' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Drop the old column and create a new one with the new name and type + # We need to do this because PostgreSQL doesn't support direct column renames with type changes + op.add_column('podcasts', sa.Column('podcast_transcript', JSON, nullable=False, server_default='{}')) + + # Copy data from old column to new column + # Convert text to JSON by storing it as a JSON string value + op.execute("UPDATE podcasts SET podcast_transcript = jsonb_build_object('text', podcast_content) WHERE podcast_content != ''") + + # Drop the old column + op.drop_column('podcasts', 'podcast_content') + + +def downgrade() -> None: + # Add back the original column + op.add_column('podcasts', sa.Column('podcast_content', sa.Text(), nullable=False, server_default='')) + + # Copy data from JSON column back to text column + # Extract the 'text' field if it exists, otherwise use empty string + op.execute("UPDATE podcasts SET podcast_content = COALESCE((podcast_transcript->>'text'), '')") + + # Drop the new column + op.drop_column('podcasts', 'podcast_transcript') \ No newline at end of file diff --git a/surfsense_backend/alembic/versions/7_remove_is_generated_column.py b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py new file mode 100644 index 000000000..03048a146 --- /dev/null +++ b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py @@ -0,0 +1,27 @@ +"""Remove is_generated column from podcasts table + +Revision ID: 7 +Revises: 6 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '7' +down_revision: Union[str, None] = '6' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Drop the is_generated column + op.drop_column('podcasts', 'is_generated') + + +def downgrade() -> None: + # Add back the is_generated column with its original constraints + op.add_column('podcasts', sa.Column('is_generated', sa.Boolean(), nullable=False, server_default='false')) \ No newline at end of file diff --git a/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py b/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py new file mode 100644 index 000000000..64982fc56 --- /dev/null +++ b/surfsense_backend/alembic/versions/8_add_content_hash_to_documents.py @@ -0,0 +1,56 @@ +"""Add content_hash column to documents table + +Revision ID: 8 +Revises: 7 +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '8' +down_revision: Union[str, None] = '7' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Add content_hash column as nullable first to handle existing data + op.add_column('documents', sa.Column('content_hash', sa.String(), nullable=True)) + + # Update existing documents to generate content hashes + # Using SHA-256 hash of the content column with proper UTF-8 encoding + op.execute(""" + UPDATE documents + SET content_hash = encode(sha256(convert_to(content, 'UTF8')), 'hex') + WHERE content_hash IS NULL + """) + + # Handle duplicate content hashes by keeping only the oldest document for each hash + # Delete newer documents with duplicate content hashes + op.execute(""" + DELETE FROM documents + WHERE id NOT IN ( + SELECT MIN(id) + FROM documents + GROUP BY content_hash + ) + """) + + # Now alter the column to match the model: nullable=False, index=True, unique=True + op.alter_column('documents', 'content_hash', + existing_type=sa.String(), + nullable=False) + op.create_index(op.f('ix_documents_content_hash'), 'documents', ['content_hash'], unique=False) + op.create_unique_constraint(op.f('uq_documents_content_hash'), 'documents', ['content_hash']) + + +def downgrade() -> None: + # Remove constraints and index first + op.drop_constraint(op.f('uq_documents_content_hash'), 'documents', type_='unique') + op.drop_index(op.f('ix_documents_content_hash'), table_name='documents') + + # Remove content_hash column from documents table + op.drop_column('documents', 'content_hash') \ No newline at end of file diff --git a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py index 1f15912d4..12d653794 100644 --- a/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py +++ b/surfsense_backend/alembic/versions/e55302644c51_add_github_connector_to_documenttype_.py @@ -2,7 +2,6 @@ Revision ID: e55302644c51 Revises: 1 -Create Date: 2025-04-13 19:56:00.059921 """ from typing import Sequence, Union diff --git a/surfsense_backend/app/agents/__init__.py b/surfsense_backend/app/agents/__init__.py index 944afebc6..e69de29bb 100644 --- a/surfsense_backend/app/agents/__init__.py +++ b/surfsense_backend/app/agents/__init__.py @@ -1 +0,0 @@ -"""This is upcoming research agent. Work in progress.""" \ No newline at end of file diff --git a/surfsense_backend/app/agents/podcaster/__init__.py b/surfsense_backend/app/agents/podcaster/__init__.py new file mode 100644 index 000000000..8459b2977 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/__init__.py @@ -0,0 +1,8 @@ +"""New LangGraph Agent. + +This module defines a custom graph. +""" + +from .graph import graph + +__all__ = ["graph"] diff --git a/surfsense_backend/app/agents/podcaster/configuration.py b/surfsense_backend/app/agents/podcaster/configuration.py new file mode 100644 index 000000000..6bbb4ce03 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/configuration.py @@ -0,0 +1,28 @@ +"""Define the configurable parameters for the agent.""" + +from __future__ import annotations + +from dataclasses import dataclass, fields +from typing import Optional + +from langchain_core.runnables import RunnableConfig + + +@dataclass(kw_only=True) +class Configuration: + """The configuration for the agent.""" + + # Changeme: Add configurable values here! 
+ # these values can be pre-set when you + # create assistants (https://langchain-ai.github.io/langgraph/cloud/how-tos/configuration_cloud/) + # and when you invoke the graph + podcast_title: str + + @classmethod + def from_runnable_config( + cls, config: Optional[RunnableConfig] = None + ) -> Configuration: + """Create a Configuration instance from a RunnableConfig object.""" + configurable = (config.get("configurable") or {}) if config else {} + _fields = {f.name for f in fields(cls) if f.init} + return cls(**{k: v for k, v in configurable.items() if k in _fields}) diff --git a/surfsense_backend/app/agents/podcaster/graph.py b/surfsense_backend/app/agents/podcaster/graph.py new file mode 100644 index 000000000..d102432ef --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/graph.py @@ -0,0 +1,31 @@ +from langgraph.graph import StateGraph + +from .configuration import Configuration +from .state import State + + +from .nodes import create_merged_podcast_audio, create_podcast_transcript + + +def build_graph(): + + # Define a new graph + workflow = StateGraph(State, config_schema=Configuration) + + # Add the node to the graph + workflow.add_node("create_podcast_transcript", create_podcast_transcript) + workflow.add_node("create_merged_podcast_audio", create_merged_podcast_audio) + + # Set the entrypoint as `call_model` + workflow.add_edge("__start__", "create_podcast_transcript") + workflow.add_edge("create_podcast_transcript", "create_merged_podcast_audio") + workflow.add_edge("create_merged_podcast_audio", "__end__") + + # Compile the workflow into an executable graph + graph = workflow.compile() + graph.name = "Surfsense Podcaster" # This defines the custom name in LangSmith + + return graph + +# Compile the graph once when the module is loaded +graph = build_graph() diff --git a/surfsense_backend/app/agents/podcaster/nodes.py b/surfsense_backend/app/agents/podcaster/nodes.py new file mode 100644 index 000000000..9ea590a53 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/nodes.py @@ -0,0 +1,206 @@ +from typing import Any, Dict +import json +import os +import uuid +from pathlib import Path +import asyncio + +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.runnables import RunnableConfig +from litellm import aspeech +from ffmpeg.asyncio import FFmpeg + +from .configuration import Configuration +from .state import PodcastTranscriptEntry, State, PodcastTranscripts +from .prompts import get_podcast_generation_prompt +from app.config import config as app_config + + +async def create_podcast_transcript(state: State, config: RunnableConfig) -> Dict[str, Any]: + """Each node does work.""" + + # Initialize LLM + llm = app_config.long_context_llm_instance + + # Get the prompt + prompt = get_podcast_generation_prompt() + + # Create the messages + messages = [ + SystemMessage(content=prompt), + HumanMessage(content=f"{state.source_content}") + ] + + # Generate the podcast transcript + llm_response = await llm.ainvoke(messages) + + # First try the direct approach + try: + podcast_transcript = PodcastTranscripts.model_validate(json.loads(llm_response.content)) + except (json.JSONDecodeError, ValueError) as e: + print(f"Direct JSON parsing failed, trying fallback approach: {str(e)}") + + # Fallback: Parse the JSON response manually + try: + # Extract JSON content from the response + content = llm_response.content + + # Find the JSON in the content (handle case where LLM might add additional text) + json_start = content.find('{') + json_end = 
content.rfind('}') + 1 + if json_start >= 0 and json_end > json_start: + json_str = content[json_start:json_end] + + # Parse the JSON string + parsed_data = json.loads(json_str) + + # Convert to Pydantic model + podcast_transcript = PodcastTranscripts.model_validate(parsed_data) + + print(f"Successfully parsed podcast transcript using fallback approach") + else: + # If JSON structure not found, raise a clear error + error_message = f"Could not find valid JSON in LLM response. Raw response: {content}" + print(error_message) + raise ValueError(error_message) + + except (json.JSONDecodeError, ValueError) as e2: + # Log the error and re-raise it + error_message = f"Error parsing LLM response (fallback also failed): {str(e2)}" + print(f"Error parsing LLM response: {str(e2)}") + print(f"Raw response: {llm_response.content}") + raise + + return { + "podcast_transcript": podcast_transcript.podcast_transcripts + } + + +async def create_merged_podcast_audio(state: State, config: RunnableConfig) -> Dict[str, Any]: + """Generate audio for each transcript and merge them into a single podcast file.""" + + configuration = Configuration.from_runnable_config(config) + + starting_transcript = PodcastTranscriptEntry( + speaker_id=1, + dialog=f"Welcome to {configuration.podcast_title} Podcast." + ) + + transcript = state.podcast_transcript + + # Merge the starting transcript with the podcast transcript + # Check if transcript is a PodcastTranscripts object or already a list + if hasattr(transcript, 'podcast_transcripts'): + transcript_entries = transcript.podcast_transcripts + else: + transcript_entries = transcript + + merged_transcript = [starting_transcript] + transcript_entries + + # Create a temporary directory for audio files + temp_dir = Path("temp_audio") + temp_dir.mkdir(exist_ok=True) + + # Generate a unique session ID for this podcast + session_id = str(uuid.uuid4()) + output_path = f"podcasts/{session_id}_podcast.mp3" + os.makedirs("podcasts", exist_ok=True) + + # Map of speaker_id to voice + voice_mapping = { + 0: "alloy", # Default/intro voice + 1: "echo", # First speaker + # 2: "fable", # Second speaker + # 3: "onyx", # Third speaker + # 4: "nova", # Fourth speaker + # 5: "shimmer" # Fifth speaker + } + + # Generate audio for each transcript segment + audio_files = [] + + async def generate_speech_for_segment(segment, index): + # Handle both dictionary and PodcastTranscriptEntry objects + if hasattr(segment, 'speaker_id'): + speaker_id = segment.speaker_id + dialog = segment.dialog + else: + speaker_id = segment.get("speaker_id", 0) + dialog = segment.get("dialog", "") + + # Select voice based on speaker_id + voice = voice_mapping.get(speaker_id, "alloy") + + # Generate a unique filename for this segment + filename = f"{temp_dir}/{session_id}_{index}.mp3" + + try: + if app_config.TTS_SERVICE_API_BASE: + response = await aspeech( + model=app_config.TTS_SERVICE, + api_base=app_config.TTS_SERVICE_API_BASE, + voice=voice, + input=dialog, + max_retries=2, + timeout=600, + ) + else: + response = await aspeech( + model=app_config.TTS_SERVICE, + voice=voice, + input=dialog, + max_retries=2, + timeout=600, + ) + + # Save the audio to a file - use proper streaming method + with open(filename, 'wb') as f: + f.write(response.content) + + return filename + except Exception as e: + print(f"Error generating speech for segment {index}: {str(e)}") + raise + + # Generate all audio files concurrently + tasks = [generate_speech_for_segment(segment, i) for i, segment in enumerate(merged_transcript)] + audio_files = 
await asyncio.gather(*tasks) + + # Merge audio files using ffmpeg + try: + # Create FFmpeg instance with the first input + ffmpeg = FFmpeg().option("y") + + # Add each audio file as input + for audio_file in audio_files: + ffmpeg = ffmpeg.input(audio_file) + + # Configure the concatenation and output + filter_complex = [] + for i in range(len(audio_files)): + filter_complex.append(f"[{i}:0]") + + filter_complex_str = "".join(filter_complex) + f"concat=n={len(audio_files)}:v=0:a=1[outa]" + ffmpeg = ffmpeg.option("filter_complex", filter_complex_str) + ffmpeg = ffmpeg.output(output_path, map="[outa]") + + # Execute FFmpeg + await ffmpeg.execute() + + print(f"Successfully created podcast audio: {output_path}") + + except Exception as e: + print(f"Error merging audio files: {str(e)}") + raise + finally: + # Clean up temporary files + for audio_file in audio_files: + try: + os.remove(audio_file) + except: + pass + + return { + "podcast_transcript": merged_transcript, + "final_podcast_file_path": output_path + } diff --git a/surfsense_backend/app/agents/podcaster/prompts.py b/surfsense_backend/app/agents/podcaster/prompts.py new file mode 100644 index 000000000..c08d38e31 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/prompts.py @@ -0,0 +1,111 @@ +import datetime + + +def get_podcast_generation_prompt(): + return f""" +Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")} + +You are a master podcast scriptwriter, adept at transforming diverse input content into a lively, engaging, and natural-sounding conversation between two distinct podcast hosts. Your primary objective is to craft authentic, flowing dialogue that captures the spontaneity and chemistry of a real podcast discussion, completely avoiding any hint of robotic scripting or stiff formality. Think dynamic interplay, not just information delivery. + + +- '': A block of text containing the information to be discussed in the podcast. This could be research findings, an article summary, a detailed outline, user chat history related to the topic, or any other relevant raw information. The content might be unstructured but serves as the factual basis for the podcast dialogue. + + + +A JSON object containing the podcast transcript with alternating speakers: +{{ + "podcast_transcripts": [ + {{ + "speaker_id": 0, + "dialog": "Speaker 0 dialog here" + }}, + {{ + "speaker_id": 1, + "dialog": "Speaker 1 dialog here" + }}, + {{ + "speaker_id": 0, + "dialog": "Speaker 0 dialog here" + }}, + {{ + "speaker_id": 1, + "dialog": "Speaker 1 dialog here" + }} + ] +}} + + + +1. **Establish Distinct & Consistent Host Personas:** + * **Speaker 0 (Lead Host):** Drives the conversation forward, introduces segments, poses key questions derived from the source content, and often summarizes takeaways. Maintain a guiding, clear, and engaging tone. + * **Speaker 1 (Co-Host/Expert):** Offers deeper insights, provides alternative viewpoints or elaborations on the source content, asks clarifying or challenging questions, and shares relevant anecdotes or examples. Adopt a complementary tone (e.g., analytical, enthusiastic, reflective, slightly skeptical). + * **Consistency is Key:** Ensure each speaker maintains their distinct voice, vocabulary choice, sentence structure, and perspective throughout the entire script. Avoid having them sound interchangeable. Their interaction should feel like a genuine partnership. + +2. 
**Craft Natural & Dynamic Dialogue:** + * **Emulate Real Conversation:** Use contractions (e.g., "don't", "it's"), interjections ("Oh!", "Wow!", "Hmm"), discourse markers ("you know", "right?", "well"), and occasional natural pauses or filler words. Avoid overly formal language or complex sentence structures typical of written text. + * **Foster Interaction & Chemistry:** Write dialogue where speakers genuinely react *to each other*. They should build on points ("Exactly, and that reminds me..."), ask follow-up questions ("Could you expand on that?"), express agreement/disagreement respectfully ("That's a fair point, but have you considered...?"), and show active listening. + * **Vary Rhythm & Pace:** Mix short, punchy lines with longer, more explanatory ones. Vary sentence beginnings. Use questions to break up exposition. The rhythm should feel spontaneous, not monotonous. + * **Inject Personality & Relatability:** Allow for appropriate humor, moments of surprise or curiosity, brief personal reflections ("I actually experienced something similar..."), or relatable asides that fit the hosts' personas and the topic. Lightly reference past discussions if it enhances context ("Remember last week when we touched on...?"). + +3. **Structure for Flow and Listener Engagement:** + * **Natural Beginning:** Start with dialogue that flows naturally after an introduction (which will be added manually). Avoid redundant greetings or podcast name mentions since these will be added separately. + * **Logical Progression & Signposting:** Guide the listener through the information smoothly. Use clear transitions to link different ideas or segments ("So, now that we've covered X, let's dive into Y...", "That actually brings me to another key finding..."). Ensure topics flow logically from one to the next. + * **Meaningful Conclusion:** Summarize the key takeaways or main points discussed, reinforcing the core message derived from the source content. End with a final thought, a lingering question for the audience, or a brief teaser for what's next, providing a sense of closure. Avoid abrupt endings. + +4. **Integrate Source Content Seamlessly & Accurately:** + * **Translate, Don't Recite:** Rephrase information from the `` into conversational language suitable for each host's persona. Avoid directly copying dense sentences or technical jargon without explanation. The goal is discussion, not narration. + * **Explain & Contextualize:** Use analogies, simple examples, storytelling, or have one host ask clarifying questions (acting as a listener surrogate) to break down complex ideas from the source. + * **Weave Information Naturally:** Integrate facts, data, or key points from the source *within* the dialogue, not as standalone, undigested blocks. Attribute information conversationally where appropriate ("The research mentioned...", "Apparently, the key factor is..."). + * **Balance Depth & Accessibility:** Ensure the conversation is informative and factually accurate based on the source content, but prioritize clear communication and engaging delivery over exhaustive technical detail. Make it understandable and interesting for a general audience. + +5. **Length & Pacing:** + * **Six-Minute Duration:** Create a transcript that, when read at a natural speaking pace, would result in approximately 6 minutes of audio. Typically, this means around 1000 words total (based on average speaking rate of 150 words per minute). + * **Concise Speaking Turns:** Keep most speaking turns relatively brief and focused. 
Aim for a natural back-and-forth rhythm rather than extended monologues. + * **Essential Content Only:** Prioritize the most important information from the source content. Focus on quality over quantity, ensuring every line contributes meaningfully to the topic. + + + +Input: "Quantum computing uses quantum bits or qubits which can exist in multiple states simultaneously due to superposition." + +Output: +{{ + "podcast_transcripts": [ + {{ + "speaker_id": 0, + "dialog": "Today we're diving into the mind-bending world of quantum computing. You know, this is a topic I've been excited to cover for weeks." + }}, + {{ + "speaker_id": 1, + "dialog": "Same here! And I know our listeners have been asking for it. But I have to admit, the concept of quantum computing makes my head spin a little. Can we start with the basics?" + }}, + {{ + "speaker_id": 0, + "dialog": "Absolutely. So regular computers use bits, right? Little on-off switches that are either 1 or 0. But quantum computers use something called qubits, and this is where it gets fascinating." + }}, + {{ + "speaker_id": 1, + "dialog": "Wait, what makes qubits so special compared to regular bits?" + }}, + {{ + "speaker_id": 0, + "dialog": "The magic is in something called superposition. These qubits can exist in multiple states at the same time, not just 1 or 0." + }}, + {{ + "speaker_id": 1, + "dialog": "That sounds impossible! How would you even picture that?" + }}, + {{ + "speaker_id": 0, + "dialog": "Think of it like a coin spinning in the air. Before it lands, is it heads or tails?" + }}, + {{ + "speaker_id": 1, + "dialog": "Well, it's... neither? Or I guess both, until it lands? Oh, I think I see where you're going with this." + }} + ] +}} + + +Transform the source material into a lively and engaging podcast conversation. Craft dialogue that showcases authentic host chemistry and natural interaction (including occasional disagreement, building on points, or asking follow-up questions). Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates *and* entertains the listener while keeping within a 5-minute audio duration. + +""" \ No newline at end of file diff --git a/surfsense_backend/app/agents/podcaster/state.py b/surfsense_backend/app/agents/podcaster/state.py new file mode 100644 index 000000000..d77270d22 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/state.py @@ -0,0 +1,38 @@ +"""Define the state structures for the agent.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import List, Optional +from pydantic import BaseModel, Field + + +class PodcastTranscriptEntry(BaseModel): + """ + Represents a single entry in a podcast transcript. + """ + speaker_id: int = Field(..., description="The ID of the speaker (0 or 1)") + dialog: str = Field(..., description="The dialog text spoken by the speaker") + + +class PodcastTranscripts(BaseModel): + """ + Represents the full podcast transcript structure. + """ + podcast_transcripts: List[PodcastTranscriptEntry] = Field( + ..., + description="List of transcript entries with alternating speakers" + ) + +@dataclass +class State: + """Defines the input state for the agent, representing a narrower interface to the outside world. + + This class is used to define the initial state and structure of incoming data. + See: https://langchain-ai.github.io/langgraph/concepts/low_level/#state + for more information. 
+ """ + + source_content: str + podcast_transcript: Optional[List[PodcastTranscriptEntry]] = None + final_podcast_file_path: Optional[str] = None diff --git a/surfsense_backend/app/agents/researcher/configuration.py b/surfsense_backend/app/agents/researcher/configuration.py index 8ba3849a3..0eb34b540 100644 --- a/surfsense_backend/app/agents/researcher/configuration.py +++ b/surfsense_backend/app/agents/researcher/configuration.py @@ -3,10 +3,16 @@ from __future__ import annotations from dataclasses import dataclass, fields +from enum import Enum from typing import Optional, List, Any from langchain_core.runnables import RunnableConfig +class SearchMode(Enum): + """Enum defining the type of search mode.""" + CHUNKS = "CHUNKS" + DOCUMENTS = "DOCUMENTS" + @dataclass(kw_only=True) class Configuration: @@ -18,6 +24,7 @@ class Configuration: connectors_to_search: List[str] user_id: str search_space_id: int + search_mode: SearchMode @classmethod diff --git a/surfsense_backend/app/agents/researcher/graph.py b/surfsense_backend/app/agents/researcher/graph.py index 31835da4a..0f6915f7a 100644 --- a/surfsense_backend/app/agents/researcher/graph.py +++ b/surfsense_backend/app/agents/researcher/graph.py @@ -1,6 +1,6 @@ from langgraph.graph import StateGraph from .state import State -from .nodes import write_answer_outline, process_sections +from .nodes import reformulate_user_query, write_answer_outline, process_sections from .configuration import Configuration from typing import TypedDict, List, Dict, Any, Optional @@ -25,11 +25,13 @@ def build_graph(): workflow = StateGraph(State, config_schema=Configuration) # Add nodes to the graph + workflow.add_node("reformulate_user_query", reformulate_user_query) workflow.add_node("write_answer_outline", write_answer_outline) workflow.add_node("process_sections", process_sections) # Define the edges - create a linear flow - workflow.add_edge("__start__", "write_answer_outline") + workflow.add_edge("__start__", "reformulate_user_query") + workflow.add_edge("reformulate_user_query", "write_answer_outline") workflow.add_edge("write_answer_outline", "process_sections") workflow.add_edge("process_sections", "__end__") diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index 15935f2ea..fcec44096 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -10,10 +10,14 @@ from langchain_core.runnables import RunnableConfig from pydantic import BaseModel, Field from sqlalchemy.ext.asyncio import AsyncSession -from .configuration import Configuration +from .configuration import Configuration, SearchMode from .prompts import get_answer_outline_system_prompt from .state import State from .sub_section_writer.graph import graph as sub_section_writer_graph +from .sub_section_writer.configuration import SubSectionType + +from app.utils.query_service import QueryService + from langgraph.types import StreamWriter @@ -41,14 +45,15 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str """ streaming_service = state.streaming_service - streaming_service.only_update_terminal("Generating answer outline...") + streaming_service.only_update_terminal("🔍 Generating answer outline...") writer({"yeild_value": streaming_service._format_annotations()}) # Get configuration from runnable config configuration = Configuration.from_runnable_config(config) + reformulated_query = state.reformulated_query user_query = configuration.user_query 
num_sections = configuration.num_sections - streaming_service.only_update_terminal(f"Planning research approach for query: {user_query[:100]}...") + streaming_service.only_update_terminal(f"🤔 Planning research approach for: \"{user_query[:100]}...\"") writer({"yeild_value": streaming_service._format_annotations()}) # Initialize LLM @@ -58,7 +63,7 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str human_message_content = f""" Now Please create an answer outline for the following query: - User Query: {user_query} + User Query: {reformulated_query} Number of Sections: {num_sections} Remember to format your response as valid JSON exactly matching this structure: @@ -78,7 +83,7 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str Your output MUST be valid JSON in exactly this format. Do not include any other text or explanation. """ - streaming_service.only_update_terminal("Designing structured outline with AI...") + streaming_service.only_update_terminal("📝 Designing structured outline with AI...") writer({"yeild_value": streaming_service._format_annotations()}) # Create messages for the LLM @@ -88,7 +93,7 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str ] # Call the LLM directly without using structured output - streaming_service.only_update_terminal("Processing answer structure...") + streaming_service.only_update_terminal("⚙️ Processing answer structure...") writer({"yeild_value": streaming_service._format_annotations()}) response = await llm.ainvoke(messages) @@ -111,7 +116,7 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str answer_outline = AnswerOutline(**parsed_data) total_questions = sum(len(section.questions) for section in answer_outline.answer_outline) - streaming_service.only_update_terminal(f"Successfully generated outline with {len(answer_outline.answer_outline)} sections and {total_questions} research questions") + streaming_service.only_update_terminal(f"✅ Successfully generated outline with {len(answer_outline.answer_outline)} sections and {total_questions} research questions!") writer({"yeild_value": streaming_service._format_annotations()}) print(f"Successfully generated answer outline with {len(answer_outline.answer_outline)} sections") @@ -121,14 +126,14 @@ async def write_answer_outline(state: State, config: RunnableConfig, writer: Str else: # If JSON structure not found, raise a clear error error_message = f"Could not find valid JSON in LLM response. 
Raw response: {content}" - streaming_service.only_update_terminal(error_message, "error") + streaming_service.only_update_terminal(f"❌ {error_message}", "error") writer({"yeild_value": streaming_service._format_annotations()}) raise ValueError(error_message) except (json.JSONDecodeError, ValueError) as e: # Log the error and re-raise it error_message = f"Error parsing LLM response: {str(e)}" - streaming_service.only_update_terminal(error_message, "error") + streaming_service.only_update_terminal(f"❌ {error_message}", "error") writer({"yeild_value": streaming_service._format_annotations()}) print(f"Error parsing LLM response: {str(e)}") @@ -143,11 +148,18 @@ async def fetch_relevant_documents( connectors_to_search: List[str], writer: StreamWriter = None, state: State = None, - top_k: int = 20 + top_k: int = 10, + connector_service: ConnectorService = None, + search_mode: SearchMode = SearchMode.CHUNKS ) -> List[Dict[str, Any]]: """ Fetch relevant documents for research questions using the provided connectors. + This function searches across multiple data sources for information related to the + research questions. It provides user-friendly feedback during the search process by + displaying connector names (like "Web Search" instead of "TAVILY_API") and adding + relevant emojis to indicate the type of source being searched. + Args: research_questions: List of research questions to find documents for user_id: The user ID @@ -157,19 +169,22 @@ async def fetch_relevant_documents( writer: StreamWriter for sending progress updates state: The current state containing the streaming service top_k: Number of top results to retrieve per connector per question + connector_service: An initialized connector service to use for searching Returns: List of relevant documents """ # Initialize services - connector_service = ConnectorService(db_session) + # connector_service = ConnectorService(db_session) # Only use streaming if both writer and state are provided streaming_service = state.streaming_service if state is not None else None # Stream initial status update if streaming_service and writer: - streaming_service.only_update_terminal(f"Starting research on {len(research_questions)} questions using {len(connectors_to_search)} connectors...") + connector_names = [get_connector_friendly_name(connector) for connector in connectors_to_search] + connector_names_str = ", ".join(connector_names) + streaming_service.only_update_terminal(f"🔎 Starting research on {len(research_questions)} questions using {connector_names_str} data sources") writer({"yeild_value": streaming_service._format_annotations()}) all_raw_documents = [] # Store all raw documents @@ -178,7 +193,7 @@ async def fetch_relevant_documents( for i, user_query in enumerate(research_questions): # Stream question being researched if streaming_service and writer: - streaming_service.only_update_terminal(f"Researching question {i+1}/{len(research_questions)}: {user_query[:100]}...") + streaming_service.only_update_terminal(f"🧠 Researching question {i+1}/{len(research_questions)}: \"{user_query[:100]}...\"") writer({"yeild_value": streaming_service._format_annotations()}) # Use original research question as the query @@ -188,7 +203,9 @@ async def fetch_relevant_documents( for connector in connectors_to_search: # Stream connector being searched if streaming_service and writer: - streaming_service.only_update_terminal(f"Searching {connector} for relevant information...") + connector_emoji = get_connector_emoji(connector) + friendly_name = 
get_connector_friendly_name(connector) + streaming_service.only_update_terminal(f"{connector_emoji} Searching {friendly_name} for relevant information...") writer({"yeild_value": streaming_service._format_annotations()}) try: @@ -197,7 +214,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -207,7 +225,7 @@ async def fetch_relevant_documents( # Stream found document count if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(youtube_chunks)} YouTube chunks relevant to the query") + streaming_service.only_update_terminal(f"📹 Found {len(youtube_chunks)} YouTube chunks related to your query") writer({"yeild_value": streaming_service._format_annotations()}) elif connector == "EXTENSION": @@ -215,7 +233,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -225,7 +244,7 @@ async def fetch_relevant_documents( # Stream found document count if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(extension_chunks)} extension chunks relevant to the query") + streaming_service.only_update_terminal(f"🧩 Found {len(extension_chunks)} Browser Extension chunks related to your query") writer({"yeild_value": streaming_service._format_annotations()}) elif connector == "CRAWLED_URL": @@ -233,7 +252,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -243,7 +263,7 @@ async def fetch_relevant_documents( # Stream found document count if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(crawled_urls_chunks)} crawled URL chunks relevant to the query") + streaming_service.only_update_terminal(f"🌐 Found {len(crawled_urls_chunks)} Web Pages chunks related to your query") writer({"yeild_value": streaming_service._format_annotations()}) elif connector == "FILE": @@ -251,7 +271,8 @@ async def fetch_relevant_documents( user_query=reformulated_query, user_id=user_id, search_space_id=search_space_id, - top_k=top_k + top_k=top_k, + search_mode=search_mode ) # Add to sources and raw documents @@ -261,9 +282,86 @@ async def fetch_relevant_documents( # Stream found document count if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(files_chunks)} file chunks relevant to the query") + streaming_service.only_update_terminal(f"📄 Found {len(files_chunks)} Files chunks related to your query") writer({"yeild_value": streaming_service._format_annotations()}) + + elif connector == "SLACK_CONNECTOR": + source_object, slack_chunks = await connector_service.search_slack( + user_query=reformulated_query, + user_id=user_id, + search_space_id=search_space_id, + top_k=top_k, + search_mode=search_mode + ) + + # Add to sources and raw documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(slack_chunks) + + # Stream found document count + if streaming_service and writer: + streaming_service.only_update_terminal(f"💬 Found {len(slack_chunks)} Slack messages related to your query") + writer({"yeild_value": streaming_service._format_annotations()}) + + elif connector == "NOTION_CONNECTOR": 
source_object, notion_chunks = await connector_service.search_notion( + user_query=reformulated_query, + user_id=user_id, + search_space_id=search_space_id, + top_k=top_k, + search_mode=search_mode + ) + + # Add to sources and raw documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(notion_chunks) + + # Stream found document count + if streaming_service and writer: + streaming_service.only_update_terminal(f"📘 Found {len(notion_chunks)} Notion pages/blocks related to your query") + writer({"yeild_value": streaming_service._format_annotations()}) + + elif connector == "GITHUB_CONNECTOR": + source_object, github_chunks = await connector_service.search_github( + user_query=reformulated_query, + user_id=user_id, + search_space_id=search_space_id, + top_k=top_k, + search_mode=search_mode + ) + + # Add to sources and raw documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(github_chunks) + + # Stream found document count + if streaming_service and writer: + streaming_service.only_update_terminal(f"🐙 Found {len(github_chunks)} GitHub files/issues related to your query") + writer({"yeild_value": streaming_service._format_annotations()}) + + elif connector == "LINEAR_CONNECTOR": + source_object, linear_chunks = await connector_service.search_linear( + user_query=reformulated_query, + user_id=user_id, + search_space_id=search_space_id, + top_k=top_k, + search_mode=search_mode + ) + + # Add to sources and raw documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(linear_chunks) + + # Stream found document count + if streaming_service and writer: + streaming_service.only_update_terminal(f"📊 Found {len(linear_chunks)} Linear issues related to your query") + writer({"yeild_value": streaming_service._format_annotations()}) + elif connector == "TAVILY_API": source_object, tavily_chunks = await connector_service.search_tavily( user_query=reformulated_query, @@ -278,87 +376,40 @@ async def fetch_relevant_documents( # Stream found document count if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(tavily_chunks)} web search results relevant to the query") + streaming_service.only_update_terminal(f"🔍 Found {len(tavily_chunks)} Web Search results related to your query") writer({"yeild_value": streaming_service._format_annotations()}) - - elif connector == "SLACK_CONNECTOR": - source_object, slack_chunks = await connector_service.search_slack( + + elif connector == "LINKUP_API": + if top_k > 10: + linkup_mode = "deep" + else: + linkup_mode = "standard" + + source_object, linkup_chunks = await connector_service.search_linkup( user_query=reformulated_query, user_id=user_id, - search_space_id=search_space_id, - top_k=top_k - ) + mode=linkup_mode + ) # Add to sources and raw documents if source_object: all_sources.append(source_object) - all_raw_documents.extend(slack_chunks) + all_raw_documents.extend(linkup_chunks) # Stream found document count if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(slack_chunks)} Slack messages relevant to the query") + streaming_service.only_update_terminal(f"🔗 Found {len(linkup_chunks)} Linkup results related to your query") writer({"yeild_value": streaming_service._format_annotations()}) - elif connector == "NOTION_CONNECTOR": - source_object, notion_chunks = await connector_service.search_notion( - user_query=reformulated_query, - user_id=user_id, - search_space_id=search_space_id, - top_k=top_k 
- ) - - # Add to sources and raw documents - if source_object: - all_sources.append(source_object) - all_raw_documents.extend(notion_chunks) - - # Stream found document count - if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(notion_chunks)} Notion pages/blocks relevant to the query") - writer({"yeild_value": streaming_service._format_annotations()}) - - elif connector == "GITHUB_CONNECTOR": - source_object, github_chunks = await connector_service.search_github( - user_query=reformulated_query, - user_id=user_id, - search_space_id=search_space_id, - top_k=top_k - ) - - # Add to sources and raw documents - if source_object: - all_sources.append(source_object) - all_raw_documents.extend(github_chunks) - - # Stream found document count - if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(github_chunks)} GitHub files/issues relevant to the query") - writer({"yeild_value": streaming_service._format_annotations()}) - - elif connector == "LINEAR_CONNECTOR": - source_object, linear_chunks = await connector_service.search_linear( - user_query=reformulated_query, - user_id=user_id, - search_space_id=search_space_id, - top_k=top_k - ) - - # Add to sources and raw documents - if source_object: - all_sources.append(source_object) - all_raw_documents.extend(linear_chunks) - - # Stream found document count - if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(linear_chunks)} Linear issues relevant to the query") - writer({"yeild_value": streaming_service._format_annotations()}) + except Exception as e: error_message = f"Error searching connector {connector}: {str(e)}" print(error_message) # Stream error message if streaming_service and writer: - streaming_service.only_update_terminal(error_message, "error") + friendly_name = get_connector_friendly_name(connector) + streaming_service.only_update_terminal(f"⚠️ Error searching {friendly_name}: {str(e)}", "error") writer({"yeild_value": streaming_service._format_annotations()}) # Continue with other connectors on error @@ -385,7 +436,7 @@ async def fetch_relevant_documents( # Stream info about deduplicated sources if streaming_service and writer: - streaming_service.only_update_terminal(f"Collected {len(deduplicated_sources)} unique sources across all connectors") + streaming_service.only_update_terminal(f"📚 Collected {len(deduplicated_sources)} unique sources across all connectors") writer({"yeild_value": streaming_service._format_annotations()}) # After all sources are collected and deduplicated, stream them @@ -415,12 +466,44 @@ async def fetch_relevant_documents( # Stream info about deduplicated documents if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(deduplicated_docs)} unique document chunks after deduplication") + streaming_service.only_update_terminal(f"🧹 Found {len(deduplicated_docs)} unique document chunks after removing duplicates") writer({"yeild_value": streaming_service._format_annotations()}) # Return deduplicated documents return deduplicated_docs +def get_connector_emoji(connector_name: str) -> str: + """Get an appropriate emoji for a connector type.""" + connector_emojis = { + "YOUTUBE_VIDEO": "📹", + "EXTENSION": "🧩", + "CRAWLED_URL": "🌐", + "FILE": "📄", + "SLACK_CONNECTOR": "💬", + "NOTION_CONNECTOR": "📘", + "GITHUB_CONNECTOR": "🐙", + "LINEAR_CONNECTOR": "📊", + "TAVILY_API": "🔍", + "LINKUP_API": "🔗" + } + return connector_emojis.get(connector_name, "🔎") + +def 
get_connector_friendly_name(connector_name: str) -> str: + """Convert technical connector IDs to user-friendly names.""" + connector_friendly_names = { + "YOUTUBE_VIDEO": "YouTube", + "EXTENSION": "Browser Extension", + "CRAWLED_URL": "Web Pages", + "FILE": "Files", + "SLACK_CONNECTOR": "Slack", + "NOTION_CONNECTOR": "Notion", + "GITHUB_CONNECTOR": "GitHub", + "LINEAR_CONNECTOR": "Linear", + "TAVILY_API": "Tavily Search", + "LINKUP_API": "Linkup Search" + } + return connector_friendly_names.get(connector_name, connector_name) + async def process_sections(state: State, config: RunnableConfig, writer: StreamWriter) -> Dict[str, Any]: """ Process all sections in parallel and combine the results. @@ -437,13 +520,17 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW answer_outline = state.answer_outline streaming_service = state.streaming_service - streaming_service.only_update_terminal(f"Starting to process research sections...") + # Initialize a dictionary to track content for all sections + # This is used to maintain section content while streaming multiple sections + section_contents = {} + + streaming_service.only_update_terminal(f"🚀 Starting to process research sections...") writer({"yeild_value": streaming_service._format_annotations()}) print(f"Processing sections from outline: {answer_outline is not None}") if not answer_outline: - streaming_service.only_update_terminal("Error: No answer outline was provided. Cannot generate report.", "error") + streaming_service.only_update_terminal("❌ Error: No answer outline was provided. Cannot generate report.", "error") writer({"yeild_value": streaming_service._format_annotations()}) return { "final_written_report": "No answer outline was provided. Cannot generate final report." 
@@ -455,16 +542,26 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW all_questions.extend(section.questions) print(f"Collected {len(all_questions)} questions from all sections") - streaming_service.only_update_terminal(f"Found {len(all_questions)} research questions across {len(answer_outline.answer_outline)} sections") + streaming_service.only_update_terminal(f"🧩 Found {len(all_questions)} research questions across {len(answer_outline.answer_outline)} sections") writer({"yeild_value": streaming_service._format_annotations()}) # Fetch relevant documents once for all questions - streaming_service.only_update_terminal("Searching for relevant information across all connectors...") + streaming_service.only_update_terminal("🔍 Searching for relevant information across all connectors...") writer({"yeild_value": streaming_service._format_annotations()}) + if configuration.num_sections == 1: + TOP_K = 10 + elif configuration.num_sections == 3: + TOP_K = 20 + elif configuration.num_sections == 6: + TOP_K = 30 + relevant_documents = [] async with async_session_maker() as db_session: try: + # Create connector service inside the db_session scope + connector_service = ConnectorService(db_session) + relevant_documents = await fetch_relevant_documents( research_questions=all_questions, user_id=configuration.user_id, @@ -472,30 +569,47 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW db_session=db_session, connectors_to_search=configuration.connectors_to_search, writer=writer, - state=state + state=state, + top_k=TOP_K, + connector_service=connector_service, + search_mode=configuration.search_mode ) except Exception as e: error_message = f"Error fetching relevant documents: {str(e)}" print(error_message) - streaming_service.only_update_terminal(error_message, "error") + streaming_service.only_update_terminal(f"❌ {error_message}", "error") writer({"yeild_value": streaming_service._format_annotations()}) # Log the error and continue with an empty list of documents # This allows the process to continue, but the report might lack information relevant_documents = [] - # Consider adding more robust error handling or reporting if needed print(f"Fetched {len(relevant_documents)} relevant documents for all sections") - streaming_service.only_update_terminal(f"Starting to draft {len(answer_outline.answer_outline)} sections using {len(relevant_documents)} relevant document chunks") + streaming_service.only_update_terminal(f"✨ Starting to draft {len(answer_outline.answer_outline)} sections using {len(relevant_documents)} relevant document chunks") writer({"yeild_value": streaming_service._format_annotations()}) # Create tasks to process each section in parallel with the same document set section_tasks = [] - streaming_service.only_update_terminal("Creating processing tasks for each section...") + streaming_service.only_update_terminal("⚙️ Creating processing tasks for each section...") writer({"yeild_value": streaming_service._format_annotations()}) - for section in answer_outline.answer_outline: + for i, section in enumerate(answer_outline.answer_outline): + if i == 0: + sub_section_type = SubSectionType.START + elif i == len(answer_outline.answer_outline) - 1: + sub_section_type = SubSectionType.END + else: + sub_section_type = SubSectionType.MIDDLE + + # Initialize the section_contents entry for this section + section_contents[i] = { + "title": section.section_title, + "content": "", + "index": i + } + section_tasks.append( 
process_section_with_documents( + section_id=i, section_title=section.section_title, section_questions=section.questions, user_query=configuration.user_query, @@ -503,19 +617,21 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW search_space_id=configuration.search_space_id, relevant_documents=relevant_documents, state=state, - writer=writer + writer=writer, + sub_section_type=sub_section_type, + section_contents=section_contents ) ) # Run all section processing tasks in parallel print(f"Running {len(section_tasks)} section processing tasks in parallel") - streaming_service.only_update_terminal(f"Processing {len(section_tasks)} sections simultaneously...") + streaming_service.only_update_terminal(f"⏳ Processing {len(section_tasks)} sections simultaneously...") writer({"yeild_value": streaming_service._format_annotations()}) section_results = await asyncio.gather(*section_tasks, return_exceptions=True) # Handle any exceptions in the results - streaming_service.only_update_terminal("Combining section results into final report...") + streaming_service.only_update_terminal("🧵 Combining section results into final report...") writer({"yeild_value": streaming_service._format_annotations()}) processed_results = [] @@ -524,7 +640,7 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW section_title = answer_outline.answer_outline[i].section_title error_message = f"Error processing section '{section_title}': {str(result)}" print(error_message) - streaming_service.only_update_terminal(error_message, "error") + streaming_service.only_update_terminal(f"⚠️ {error_message}", "error") writer({"yeild_value": streaming_service._format_annotations()}) processed_results.append(error_message) else: @@ -542,31 +658,18 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW final_written_report = "\n".join(final_report) print(f"Generated final report with {len(final_report)} parts") - streaming_service.only_update_terminal("Final research report generated successfully!") + streaming_service.only_update_terminal("🎉 Final research report generated successfully!") writer({"yeild_value": streaming_service._format_annotations()}) - if hasattr(state, 'streaming_service') and state.streaming_service: - # Convert the final report to the expected format for UI: - # A list of strings where empty strings represent line breaks - formatted_report = [] - for section in final_report: - if section == "\n": - # Add an empty string for line breaks - formatted_report.append("") - else: - # Split any multiline content by newlines and add each line - section_lines = section.split("\n") - formatted_report.extend(section_lines) - - state.streaming_service.only_update_answer(formatted_report) - writer({"yeild_value": state.streaming_service._format_annotations()}) - + # Skip the final update since we've been streaming incremental updates + # The final answer from each section is already shown in the UI return { "final_written_report": final_written_report } async def process_section_with_documents( + section_id: int, section_title: str, section_questions: List[str], user_id: str, @@ -574,12 +677,15 @@ relevant_documents: List[Dict[str, Any]], user_query: str, state: State = None, - writer: StreamWriter = None + writer: StreamWriter = None, + sub_section_type: SubSectionType = SubSectionType.MIDDLE, + section_contents: Dict[int, Dict[str, Any]] = None ) -> str: """ Process a single section using 
pre-fetched documents. Args: + section_id: The ID of the section section_title: The title of the section section_questions: List of research questions for this section user_id: The user ID @@ -587,6 +693,8 @@ relevant_documents: Pre-fetched documents to use for this section state: The current state writer: StreamWriter for sending progress updates + sub_section_type: The type of section (start, middle, end) + section_contents: Dictionary to track content across multiple sections Returns: The written section content @@ -597,14 +705,14 @@ # Send status update via streaming if available if state and state.streaming_service and writer: - state.streaming_service.only_update_terminal(f"Writing section: {section_title} with {len(section_questions)} research questions") + state.streaming_service.only_update_terminal(f"📝 Writing section: \"{section_title}\" with {len(section_questions)} research questions") writer({"yeild_value": state.streaming_service._format_annotations()}) # Fallback if no documents found if not documents_to_use: print(f"No relevant documents found for section: {section_title}") if state and state.streaming_service and writer: - state.streaming_service.only_update_terminal(f"Warning: No relevant documents found for section: {section_title}", "warning") + state.streaming_service.only_update_terminal(f"⚠️ Warning: No relevant documents found for section: \"{section_title}\"", "warning") writer({"yeild_value": state.streaming_service._format_annotations()}) documents_to_use = [ @@ -619,6 +727,7 @@ "configurable": { "sub_section_title": section_title, "sub_section_questions": section_questions, + "sub_section_type": sub_section_type, "user_query": user_query, "relevant_documents": documents_to_use, "user_id": user_id, @@ -626,33 +735,94 @@ } } - # Create the initial state with db_session - sub_state = {"db_session": db_session} + # Create the initial state with db_session and chat_history + sub_state = { + "db_session": db_session, + "chat_history": state.chat_history + } - # Invoke the sub-section writer graph + # Invoke the sub-section writer graph with streaming print(f"Invoking sub_section_writer for: {section_title}") if state and state.streaming_service and writer: - state.streaming_service.only_update_terminal(f"Analyzing information and drafting content for section: {section_title}") + state.streaming_service.only_update_terminal(f"🧠 Analyzing information and drafting content for section: \"{section_title}\"") writer({"yeild_value": state.streaming_service._format_annotations()}) - - result = await sub_section_writer_graph.ainvoke(sub_state, config) - # Return the final answer from the sub_section_writer - final_answer = result.get("final_answer", "No content was generated for this section.") + # Variables to track streaming state + complete_content = "" # Tracks the complete content received so far - # Send section content update via streaming if available + async for chunk_type, chunk in sub_section_writer_graph.astream(sub_state, config, stream_mode=["values"]): + if "final_answer" in chunk: + new_content = chunk["final_answer"] + if new_content and new_content != complete_content: + # Extract only the new content (delta) + delta = new_content[len(complete_content):] + + # Update what we've processed so far + complete_content = new_content + + # Only stream if there's actual new content + if delta and 
state and state.streaming_service and writer: + # Update terminal with real-time progress indicator + state.streaming_service.only_update_terminal(f"✍️ Writing section {section_id+1}... ({len(complete_content.split())} words)") + + # Update section_contents with just the new delta + section_contents[section_id]["content"] += delta + + # Build UI-friendly content for all sections + complete_answer = [] + for i in range(len(section_contents)): + if i in section_contents and section_contents[i]["content"]: + # Add section header + complete_answer.append(f"# {section_contents[i]['title']}") + complete_answer.append("") # Empty line after title + + # Add section content + content_lines = section_contents[i]["content"].split("\n") + complete_answer.extend(content_lines) + complete_answer.append("") # Empty line after content + + # Update answer in UI in real-time + state.streaming_service.only_update_answer(complete_answer) + writer({"yeild_value": state.streaming_service._format_annotations()}) + + # Set default if no content was received + if not complete_content: + complete_content = "No content was generated for this section." + section_contents[section_id]["content"] = complete_content + + # Final terminal update if state and state.streaming_service and writer: - state.streaming_service.only_update_terminal(f"Completed writing section: {section_title}") + state.streaming_service.only_update_terminal(f"✅ Completed section: \"{section_title}\"") writer({"yeild_value": state.streaming_service._format_annotations()}) - - return final_answer + + return complete_content except Exception as e: print(f"Error processing section '{section_title}': {str(e)}") # Send error update via streaming if available if state and state.streaming_service and writer: - state.streaming_service.only_update_terminal(f"Error processing section '{section_title}': {str(e)}", "error") + state.streaming_service.only_update_terminal(f"❌ Error processing section \"{section_title}\": {str(e)}", "error") writer({"yeild_value": state.streaming_service._format_annotations()}) return f"Error processing section: {section_title}. Details: {str(e)}" + + +async def reformulate_user_query(state: State, config: RunnableConfig, writer: StreamWriter) -> Dict[str, Any]: + """ + Reformulates the user query based on the chat history. 
+ """ + + configuration = Configuration.from_runnable_config(config) + user_query = configuration.user_query + chat_history_str = await QueryService.langchain_chat_history_to_str(state.chat_history) + if len(state.chat_history) == 0: + reformulated_query = user_query + else: + reformulated_query = await QueryService.reformulate_query_with_chat_history(user_query, chat_history_str) + + return { + "reformulated_query": reformulated_query + } + + diff --git a/surfsense_backend/app/agents/researcher/state.py b/surfsense_backend/app/agents/researcher/state.py index dd36163b6..edc73f1e0 100644 --- a/surfsense_backend/app/agents/researcher/state.py +++ b/surfsense_backend/app/agents/researcher/state.py @@ -3,7 +3,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import Optional, Any +from typing import List, Optional, Any from sqlalchemy.ext.asyncio import AsyncSession from app.utils.streaming_service import StreamingService @@ -21,7 +21,9 @@ class State: # Streaming service streaming_service: StreamingService - # Intermediate state - populated during workflow + chat_history: Optional[List[Any]] = field(default_factory=list) + + reformulated_query: Optional[str] = field(default=None) # Using field to explicitly mark as part of state answer_outline: Optional[Any] = field(default=None) diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/configuration.py b/surfsense_backend/app/agents/researcher/sub_section_writer/configuration.py index 9e1ca32b5..b7acf8bb1 100644 --- a/surfsense_backend/app/agents/researcher/sub_section_writer/configuration.py +++ b/surfsense_backend/app/agents/researcher/sub_section_writer/configuration.py @@ -3,11 +3,19 @@ from __future__ import annotations from dataclasses import dataclass, fields +from enum import Enum from typing import Optional, List, Any from langchain_core.runnables import RunnableConfig +class SubSectionType(Enum): + """Enum defining the type of sub-section.""" + START = "START" + MIDDLE = "MIDDLE" + END = "END" + + @dataclass(kw_only=True) class Configuration: """The configuration for the agent.""" @@ -15,6 +23,7 @@ class Configuration: # Input parameters provided at invocation sub_section_title: str sub_section_questions: List[str] + sub_section_type: SubSectionType user_query: str relevant_documents: List[Any] # Documents provided directly to the agent user_id: str diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py index 0bec4618c..deb48449f 100644 --- a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py +++ b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py @@ -5,6 +5,7 @@ from typing import Any, Dict from app.config import config as app_config from .prompts import get_citation_system_prompt from langchain_core.messages import HumanMessage, SystemMessage +from .configuration import SubSectionType async def rerank_documents(state: State, config: RunnableConfig) -> Dict[str, Any]: """ @@ -38,7 +39,9 @@ async def rerank_documents(state: State, config: RunnableConfig) -> Dict[str, An try: # Use the sub-section questions for reranking context # rerank_query = "\n".join(sub_section_questions) - rerank_query = configuration.user_query + # rerank_query = configuration.user_query + + rerank_query = configuration.user_query + "\n" + "\n".join(sub_section_questions) # Convert documents to format expected by reranker if needed reranker_input_docs = [ @@ -102,13 
+105,14 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A # Extract content and metadata content = doc.get("content", "") doc_info = doc.get("document", {}) - document_id = doc_info.get("id", f"{i+1}") # Use document ID or index+1 as source_id + document_id = doc_info.get("id") # Use document ID # Format document according to the citation system prompt's expected format formatted_doc = f""" {document_id} + {doc_info.get("document_type", "CRAWLED_URL")} {content} @@ -122,12 +126,27 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A sub_section_questions = configuration.sub_section_questions user_query = configuration.user_query # Get the original user query documents_text = "\n".join(formatted_documents) - + sub_section_type = configuration.sub_section_type + # Format the questions as bullet points for clarity questions_text = "\n".join([f"- {question}" for question in sub_section_questions]) + # Provide more context based on the subsection type + section_position_context = "" + if sub_section_type == SubSectionType.START: + section_position_context = "This is the INTRODUCTION section. " + elif sub_section_type == SubSectionType.MIDDLE: + section_position_context = "This is a MIDDLE section. Ensure this content flows naturally from previous sections and into subsequent ones. This could be any middle section in the document, so maintain coherence with the overall structure while addressing the specific topic of this section. Do not provide any conclusions in this section, as conclusions should only appear in the final section." + elif sub_section_type == SubSectionType.END: + section_position_context = "This is the CONCLUSION section. Focus on summarizing key points, providing closure." + # Construct a clear, structured query for the LLM human_message_content = f""" + Source material: + + {documents_text} + + Now user's query is: {user_query} @@ -137,21 +156,24 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A {section_title} + + + {section_position_context} + - Use the provided documents as your source material and cite them properly using the IEEE citation format [X] where X is the source_id. - - {documents_text} - + + {questions_text} + """ # Create messages for the LLM - messages = [ + messages_with_chat_history = state.chat_history + [ SystemMessage(content=get_citation_system_prompt()), HumanMessage(content=human_message_content) ] # Call the LLM and get the response - response = await llm.ainvoke(messages) + response = await llm.ainvoke(messages_with_chat_history) final_answer = response.content return { diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py index 18a91eb07..e87c9e8a8 100644 --- a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py +++ b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py @@ -4,16 +4,28 @@ import datetime def get_citation_system_prompt(): return f""" Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")} -You are a research assistant tasked with analyzing documents and providing comprehensive answers with proper citations in IEEE format. +You are SurfSense, an advanced AI research assistant that synthesizes information from multiple knowledge sources to provide comprehensive, well-cited answers to user queries. 
+ +- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history) +- CRAWLED_URL: "Webpages indexed by SurfSense web crawler" (personally selected websites) +- FILE: "User-uploaded documents (PDFs, Word, etc.)" (personal files) +- SLACK_CONNECTOR: "Slack conversations and shared content" (personal workspace communications) +- NOTION_CONNECTOR: "Notion workspace pages and databases" (personal knowledge management) +- YOUTUBE_VIDEO: "YouTube video transcripts and metadata" (personally saved videos) +- GITHUB_CONNECTOR: "GitHub repository content and issues" (personal repositories and interactions) +- LINEAR_CONNECTOR: "Linear project issues and discussions" (personal project management) +- TAVILY_API: "Tavily search API results" (personalized search results) +- LINKUP_API: "Linkup search API results" (personalized search results) + 1. Carefully analyze all provided documents in the section's. 2. Extract relevant information that addresses the user's query. -3. Synthesize a comprehensive, well-structured answer using information from these documents. +3. Synthesize a comprehensive, personalized answer using information from the user's personal knowledge sources. 4. For EVERY piece of information you include from the documents, add an IEEE-style citation in square brackets [X] where X is the source_id from the document's metadata. 5. Make sure ALL factual statements from the documents have proper citations. 6. If multiple documents support the same point, include all relevant citations [X], [Y]. -7. Present information in a logical, coherent flow. +7. Present information in a logical, coherent flow that reflects the user's personal context. 8. Use your own words to connect ideas, but cite ALL information from the documents. 9. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations. 10. Do not make up or include information not found in the provided documents. @@ -25,10 +37,14 @@ You are a research assistant tasked with analyzing documents and providing compr 16. CRITICAL: Citations must ONLY appear as [X] or [X], [Y], [Z] format - never with parentheses, hyperlinks, or other formatting. 17. CRITICAL: Never make up citation numbers. Only use source_id values that are explicitly provided in the document metadata. 18. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up. +19. CRITICAL: Focus only on answering the user's query. Any guiding questions provided are for your thinking process only and should not be mentioned in your response. +20. CRITICAL: Ensure your response aligns with the provided sub-section title and section position. +21. CRITICAL: Remember that all knowledge sources contain personal information - provide answers that reflect this personal context. 
- Write in clear, professional language suitable for academic or technical audiences +- Tailor your response to the user's personal context based on their knowledge sources - Organize your response with appropriate paragraphs, headings, and structure - Every fact from the documents must have an IEEE-style citation in square brackets [X] where X is the EXACT source_id from the document's metadata - Citations should appear at the end of the sentence containing the information they support @@ -37,12 +53,17 @@ You are a research assistant tasked with analyzing documents and providing compr - NEVER create your own citation numbering system - use the exact source_id values from the documents. - NEVER format citations as clickable links or as markdown links like "([1](https://example.com))". Always use plain square brackets only. - NEVER make up citation numbers if you are unsure about the source_id. It is better to omit the citation than to guess. +- NEVER include or mention the guiding questions in your response. They are only to help guide your thinking. +- ALWAYS focus on answering the user's query directly from the information in the documents. +- ALWAYS provide personalized answers that reflect the user's own knowledge and context. + 1 + EXTENSION The Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the coast of Queensland, Australia. It comprises over 2,900 individual reefs and 900 islands. @@ -52,6 +73,7 @@ You are a research assistant tasked with analyzing documents and providing compr 13 + YOUTUBE_VIDEO Climate change poses a significant threat to coral reefs worldwide. Rising ocean temperatures have led to mass coral bleaching events in the Great Barrier Reef in 2016, 2017, and 2020. @@ -61,15 +83,17 @@ You are a research assistant tasked with analyzing documents and providing compr 21 + CRAWLED_URL The Great Barrier Reef was designated a UNESCO World Heritage Site in 1981 due to its outstanding universal value and biological diversity. It is home to over 1,500 species of fish and 400 types of coral. + - The Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the coast of Queensland, Australia [1]. It was designated a UNESCO World Heritage Site in 1981 due to its outstanding universal value and biological diversity [21]. The reef is home to over 1,500 species of fish and 400 types of coral [21]. Unfortunately, climate change poses a significant threat to coral reefs worldwide, with rising ocean temperatures leading to mass coral bleaching events in the Great Barrier Reef in 2016, 2017, and 2020 [13]. The reef system comprises over 2,900 individual reefs and 900 islands [1], making it an ecological treasure that requires protection from multiple threats [1], [13]. + Based on your saved browser content and videos, the Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the coast of Queensland, Australia [1]. From your browsing history, you've looked into its designation as a UNESCO World Heritage Site in 1981 due to its outstanding universal value and biological diversity [21]. The reef is home to over 1,500 species of fish and 400 types of coral [21]. According to a YouTube video you've watched, climate change poses a significant threat to coral reefs worldwide, with rising ocean temperatures leading to mass coral bleaching events in the Great Barrier Reef in 2016, 2017, and 2020 [13]. 
The reef system comprises over 2,900 individual reefs and 900 islands [1], making it an ecological treasure that requires protection from multiple threats [1], [13]. @@ -84,4 +108,22 @@ ONLY use plain square brackets [1] or multiple citations [1], [2], [3] Note that the citation numbers match exactly with the source_id values (1, 13, and 21) and are not renumbered sequentially. Citations follow IEEE style with square brackets and appear at the end of sentences. + + +When you see a user query like: + + Give all linear issues. + + +Focus exclusively on answering this query using information from the provided documents, which contain the user's personal knowledge and data. + +If guiding questions are provided in a section, use them only to guide your thinking process. Do not mention or list these questions in your response. + +Make sure your response: +1. Directly answers the user's query with personalized information from their own knowledge sources +2. Fits the provided sub-section title and section position +3. Uses proper citations for all information from documents +4. Is well-structured and professional in tone +5. Acknowledges the personal nature of the information being provided + """ \ No newline at end of file diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/state.py b/surfsense_backend/app/agents/researcher/sub_section_writer/state.py index b33abe6bd..7998279be 100644 --- a/surfsense_backend/app/agents/researcher/sub_section_writer/state.py +++ b/surfsense_backend/app/agents/researcher/sub_section_writer/state.py @@ -2,7 +2,7 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List, Optional, Any from sqlalchemy.ext.asyncio import AsyncSession @@ -17,6 +17,7 @@ class State: # Runtime context db_session: AsyncSession + chat_history: Optional[List[Any]] = field(default_factory=list) # OUTPUT: Populated by agent nodes reranked_documents: Optional[List[Any]] = None final_answer: Optional[str] = None diff --git a/surfsense_backend/app/app.py b/surfsense_backend/app/app.py index 500861c69..956740f7b 100644 --- a/surfsense_backend/app/app.py +++ b/surfsense_backend/app/app.py @@ -6,16 +6,18 @@ from fastapi.middleware.cors import CORSMiddleware from sqlalchemy.ext.asyncio import AsyncSession from app.db import User, create_db_and_tables, get_async_session -from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever from app.schemas import UserCreate, UserRead, UserUpdate + + +from app.routes import router as crud_router +from app.config import config + from app.users import ( SECRET, auth_backend, fastapi_users, - google_oauth_client, - current_active_user, + current_active_user ) -from app.routes import router as crud_router @asynccontextmanager @@ -59,11 +61,20 @@ app.include_router( prefix="/users", tags=["users"], ) -app.include_router( - fastapi_users.get_oauth_router(google_oauth_client, auth_backend, SECRET, is_verified_by_default=True), - prefix="/auth/google", - tags=["auth"], -) + +if config.AUTH_TYPE == "GOOGLE": + from app.users import google_oauth_client + app.include_router( + fastapi_users.get_oauth_router( + google_oauth_client, + auth_backend, + SECRET, + is_verified_by_default=True + ), + prefix="/auth/google", + tags=["auth"], + ) + app.include_router(crud_router, prefix="/api/v1", tags=["crud"]) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index c7f842b71..9135c3222 100644 --- 
a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -1,12 +1,12 @@ import os from pathlib import Path +import shutil -from chonkie import AutoEmbeddings, LateChunker -from rerankers import Reranker -from langchain_community.chat_models import ChatLiteLLM - - +from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker from dotenv import load_dotenv +from langchain_community.chat_models import ChatLiteLLM +from rerankers import Reranker + # Get the base directory of the project BASE_DIR = Path(__file__).resolve().parent.parent.parent @@ -15,33 +15,74 @@ env_file = BASE_DIR / ".env" load_dotenv(env_file) +def is_ffmpeg_installed(): + """ + Check if ffmpeg is installed on the current system. + + Returns: + bool: True if ffmpeg is installed, False otherwise. + """ + return shutil.which("ffmpeg") is not None + + class Config: + # Check if ffmpeg is installed + if not is_ffmpeg_installed(): + import static_ffmpeg + # ffmpeg installed on first call to add_paths(), threadsafe. + static_ffmpeg.add_paths() + # check if ffmpeg is installed again + if not is_ffmpeg_installed(): + raise ValueError("FFmpeg is not installed on the system. Please install it to use the Surfsense Podcaster.") + # Database DATABASE_URL = os.getenv("DATABASE_URL") - - # Google OAuth - GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID") - GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") + NEXT_FRONTEND_URL = os.getenv("NEXT_FRONTEND_URL") + + # AUTH: Google OAuth + AUTH_TYPE = os.getenv("AUTH_TYPE") + if AUTH_TYPE == "GOOGLE": + GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID") + GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") + + # LONG-CONTEXT LLMS LONG_CONTEXT_LLM = os.getenv("LONG_CONTEXT_LLM") - long_context_llm_instance = ChatLiteLLM(model=LONG_CONTEXT_LLM) + LONG_CONTEXT_LLM_API_BASE = os.getenv("LONG_CONTEXT_LLM_API_BASE") + if LONG_CONTEXT_LLM_API_BASE: + long_context_llm_instance = ChatLiteLLM(model=LONG_CONTEXT_LLM, api_base=LONG_CONTEXT_LLM_API_BASE) + else: + long_context_llm_instance = ChatLiteLLM(model=LONG_CONTEXT_LLM) - # GPT Researcher + # FAST LLM FAST_LLM = os.getenv("FAST_LLM") - STRATEGIC_LLM = os.getenv("STRATEGIC_LLM") - fast_llm_instance = ChatLiteLLM(model=FAST_LLM) - strategic_llm_instance = ChatLiteLLM(model=STRATEGIC_LLM) + FAST_LLM_API_BASE = os.getenv("FAST_LLM_API_BASE") + if FAST_LLM_API_BASE: + fast_llm_instance = ChatLiteLLM(model=FAST_LLM, api_base=FAST_LLM_API_BASE) + else: + fast_llm_instance = ChatLiteLLM(model=FAST_LLM) + + + # STRATEGIC LLM + STRATEGIC_LLM = os.getenv("STRATEGIC_LLM") + STRATEGIC_LLM_API_BASE = os.getenv("STRATEGIC_LLM_API_BASE") + if STRATEGIC_LLM_API_BASE: + strategic_llm_instance = ChatLiteLLM(model=STRATEGIC_LLM, api_base=STRATEGIC_LLM_API_BASE) + else: + strategic_llm_instance = ChatLiteLLM(model=STRATEGIC_LLM) # Chonkie Configuration | Edit this to your needs EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL") embedding_model_instance = AutoEmbeddings.get_embeddings(EMBEDDING_MODEL) - chunker_instance = LateChunker( - embedding_model=EMBEDDING_MODEL, - chunk_size=embedding_model_instance.max_seq_length, + chunker_instance = RecursiveChunker( + chunk_size=getattr(embedding_model_instance, 'max_seq_length', 512) + ) + code_chunker_instance = CodeChunker( + chunk_size=getattr(embedding_model_instance, 'max_seq_length', 512) ) # Reranker's Configuration | Pinecode, Cohere etc. 
Read more at https://github.com/AnswerDotAI/rerankers?tab=readme-ov-file#usage @@ -55,12 +96,30 @@ class Config: # OAuth JWT SECRET_KEY = os.getenv("SECRET_KEY") - # Unstructured API Key - UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") + # ETL Service + ETL_SERVICE = os.getenv("ETL_SERVICE") + if ETL_SERVICE == "UNSTRUCTURED": + # Unstructured API Key + UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") + + elif ETL_SERVICE == "LLAMACLOUD": + # LlamaCloud API Key + LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") + + # Firecrawl API Key FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None) + # Litellm TTS Configuration + TTS_SERVICE = os.getenv("TTS_SERVICE") + TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE") + + # Litellm STT Configuration + STT_SERVICE = os.getenv("STT_SERVICE") + STT_SERVICE_API_BASE = os.getenv("STT_SERVICE_API_BASE") + + # Validation Checks # Check embedding dimension if hasattr(embedding_model_instance, 'dimension') and embedding_model_instance.dimension > 2000: diff --git a/surfsense_backend/app/connectors/github_connector.py b/surfsense_backend/app/connectors/github_connector.py index 265f89b0a..a25bc980a 100644 --- a/surfsense_backend/app/connectors/github_connector.py +++ b/surfsense_backend/app/connectors/github_connector.py @@ -80,7 +80,7 @@ class GitHubConnector: # type='owner' fetches repos owned by the user # type='member' fetches repos the user is a collaborator on (including orgs) # type='all' fetches both - for repo in self.gh.repositories(type='owner', sort='updated'): + for repo in self.gh.repositories(type='all', sort='updated'): repos_data.append({ "id": repo.id, "name": repo.name, diff --git a/surfsense_backend/app/connectors/linear_connector.py b/surfsense_backend/app/connectors/linear_connector.py index be9a1a49d..52b770445 100644 --- a/surfsense_backend/app/connectors/linear_connector.py +++ b/surfsense_backend/app/connectors/linear_connector.py @@ -6,7 +6,7 @@ Allows fetching issue lists and their comments with date range filtering. """ import requests -from datetime import datetime, timedelta +from datetime import datetime from typing import Dict, List, Optional, Tuple, Any, Union diff --git a/surfsense_backend/app/connectors/slack_history.py b/surfsense_backend/app/connectors/slack_history.py index 67e540354..1c1ec5602 100644 --- a/surfsense_backend/app/connectors/slack_history.py +++ b/surfsense_backend/app/connectors/slack_history.py @@ -6,11 +6,15 @@ Allows fetching channel lists and message history with date range filtering. """ import os +import time # Added import +import logging # Added import from slack_sdk import WebClient from slack_sdk.errors import SlackApiError -from datetime import datetime, timedelta +from datetime import datetime from typing import Dict, List, Optional, Tuple, Any, Union +logger = logging.getLogger(__name__) # Added logger + class SlackHistory: """Class for retrieving conversation history from Slack channels.""" @@ -33,56 +37,88 @@ class SlackHistory: """ self.client = WebClient(token=token) - def get_all_channels(self, include_private: bool = True) -> Dict[str, str]: + def get_all_channels(self, include_private: bool = True) -> List[Dict[str, Any]]: """ - Fetch all channels that the bot has access to. + Fetch all channels that the bot has access to, with rate limit handling. Args: include_private: Whether to include private channels Returns: - Dictionary mapping channel names to channel IDs + List of dictionaries, each representing a channel with id, name, is_private, is_member. 
Raises: ValueError: If no Slack client has been initialized - SlackApiError: If there's an error calling the Slack API + SlackApiError: If there's an unrecoverable error calling the Slack API + RuntimeError: For unexpected errors during channel fetching. """ if not self.client: raise ValueError("Slack client not initialized. Call set_token() first.") - channels_dict = {} + channels_list = [] # Changed from dict to list types = "public_channel" if include_private: types += ",private_channel" - - try: - # Call the conversations.list method - result = self.client.conversations_list( - types=types, - limit=1000 # Maximum allowed by API - ) - channels = result["channels"] - - # Handle pagination for workspaces with many channels - while result.get("response_metadata", {}).get("next_cursor"): - next_cursor = result["response_metadata"]["next_cursor"] - - # Get the next batch of channels - result = self.client.conversations_list( + + next_cursor = None + is_first_request = True + + while is_first_request or next_cursor: + try: + if not is_first_request: # Add delay only for paginated requests + logger.info(f"Paginating for channels, waiting 3 seconds before next call. Cursor: {next_cursor}") + time.sleep(3) + + current_limit = 1000 # Max limit + api_result = self.client.conversations_list( types=types, cursor=next_cursor, - limit=1000 + limit=current_limit ) - channels.extend(result["channels"]) - - # Create a dictionary mapping channel names to IDs - for channel in channels: - channels_dict[channel["name"]] = channel["id"] - - return channels_dict + + channels_on_page = api_result["channels"] + for channel in channels_on_page: + if "name" in channel and "id" in channel: + channel_data = { + "id": channel.get("id"), + "name": channel.get("name"), + "is_private": channel.get("is_private", False), + # is_member is often part of the channel object from conversations.list + # It indicates if the authenticated user (bot) is a member. + # For public channels, this might be true or the API might not focus on it + # if the bot can read it anyway. For private, it's crucial. + "is_member": channel.get("is_member", False) + } + channels_list.append(channel_data) + else: + logger.warning(f"Channel found with missing name or id. Data: {channel}") + + + next_cursor = api_result.get("response_metadata", {}).get("next_cursor") + is_first_request = False # Subsequent requests are not the first + + if not next_cursor: # All pages processed + break + + except SlackApiError as e: + if e.response is not None and e.response.status_code == 429: + retry_after_header = e.response.headers.get('Retry-After') + wait_duration = 60 # Default wait time + if retry_after_header and retry_after_header.isdigit(): + wait_duration = int(retry_after_header) + + logger.warning(f"Slack API rate limit hit while fetching channels. Waiting for {wait_duration} seconds. 
Cursor: {next_cursor}") + time.sleep(wait_duration) + # The loop will continue, retrying with the same cursor + else: + # Not a 429 error, or no response object, re-raise + raise SlackApiError(f"Error retrieving channels: {e}", e.response) + except Exception as general_error: + # Handle other potential errors like network issues if necessary, or re-raise + logger.error(f"An unexpected error occurred during channel fetching: {general_error}") + raise RuntimeError(f"An unexpected error occurred during channel fetching: {general_error}") - except SlackApiError as e: - raise SlackApiError(f"Error retrieving channels: {e}", e.response) + return channels_list def get_conversation_history( self, @@ -110,17 +146,18 @@ class SlackHistory: if not self.client: raise ValueError("Slack client not initialized. Call set_token() first.") - try: - # Call the conversations.history method - messages = [] - next_cursor = None - - while True: + messages = [] + next_cursor = None + + while True: + try: + # Proactive delay for conversations.history (Tier 3) + time.sleep(1.2) # Wait 1.2 seconds before each history call. + kwargs = { "channel": channel_id, "limit": min(limit, 1000), # API max is 1000 } - if oldest: kwargs["oldest"] = oldest if latest: @@ -128,22 +165,57 @@ class SlackHistory: if next_cursor: kwargs["cursor"] = next_cursor - result = self.client.conversations_history(**kwargs) + current_api_call_successful = False + result = None # Ensure result is defined + try: + result = self.client.conversations_history(**kwargs) + current_api_call_successful = True + except SlackApiError as e_history: + if e_history.response is not None and e_history.response.status_code == 429: + retry_after_str = e_history.response.headers.get('Retry-After') + wait_time = 60 # Default + if retry_after_str and retry_after_str.isdigit(): + wait_time = int(retry_after_str) + logger.warning( + f"Rate limited by Slack on conversations.history for channel {channel_id}. " + f"Retrying after {wait_time} seconds. Cursor: {next_cursor}" + ) + time.sleep(wait_time) + # current_api_call_successful remains False, loop will retry this page + else: + raise # Re-raise to outer handler for not_in_channel or other SlackApiErrors + + if not current_api_call_successful: + continue # Retry the current page fetch due to handled rate limit + + # Process result if successful batch = result["messages"] messages.extend(batch) - # Check if we need to paginate if result.get("has_more", False) and len(messages) < limit: next_cursor = result["response_metadata"]["next_cursor"] else: - break + break # Exit pagination loop - # Respect the overall limit parameter - return messages[:limit] + except SlackApiError as e: # Outer catch for not_in_channel or unhandled SlackApiErrors from inner try + if (e.response is not None and + hasattr(e.response, 'data') and + isinstance(e.response.data, dict) and + e.response.data.get('error') == 'not_in_channel'): + logger.warning( + f"Bot is not in channel '{channel_id}'. Cannot fetch history. " + "Please add the bot to this channel." 
+ ) + return [] + # For other SlackApiErrors from inner block or this level + raise SlackApiError(f"Error retrieving history for channel {channel_id}: {e}", e.response) + except Exception as general_error: # Catch any other unexpected errors + logger.error(f"Unexpected error in get_conversation_history for channel {channel_id}: {general_error}") + # Re-raise the general error to allow higher-level handling or visibility + raise - except SlackApiError as e: - raise SlackApiError(f"Error retrieving history for channel {channel_id}: {e}", e.response) - + return messages[:limit] + @staticmethod def convert_date_to_timestamp(date_str: str) -> Optional[int]: """ @@ -220,12 +292,31 @@ class SlackHistory: """ if not self.client: raise ValueError("Slack client not initialized. Call set_token() first.") - - try: - result = self.client.users_info(user=user_id) - return result["user"] - except SlackApiError as e: - raise SlackApiError(f"Error retrieving user info for {user_id}: {e}", e.response) + + while True: + try: + # Proactive delay for users.info (Tier 4) - generally not needed unless called extremely rapidly. + # For now, we are only adding Retry-After as per plan. + # time.sleep(0.6) # Optional: ~100 req/min if ever needed. + + result = self.client.users_info(user=user_id) + return result["user"] # Success, return and exit loop implicitly + + except SlackApiError as e_user_info: + if e_user_info.response is not None and e_user_info.response.status_code == 429: + retry_after_str = e_user_info.response.headers.get('Retry-After') + wait_time = 30 # Default for Tier 4, can be adjusted + if retry_after_str and retry_after_str.isdigit(): + wait_time = int(retry_after_str) + logger.warning(f"Rate limited by Slack on users.info for user {user_id}. Retrying after {wait_time} seconds.") + time.sleep(wait_time) + continue # Retry the API call + else: + # Not a 429 error, or no response object, re-raise + raise SlackApiError(f"Error retrieving user info for {user_id}: {e_user_info}", e_user_info.response) + except Exception as general_error: # Catch any other unexpected errors + logger.error(f"Unexpected error in get_user_info for user {user_id}: {general_error}") + raise # Re-raise unexpected errors def format_message(self, msg: Dict[str, Any], include_user_info: bool = False) -> Dict[str, Any]: """ diff --git a/surfsense_backend/app/connectors/test_github_connector.py b/surfsense_backend/app/connectors/test_github_connector.py new file mode 100644 index 000000000..ad8a0682a --- /dev/null +++ b/surfsense_backend/app/connectors/test_github_connector.py @@ -0,0 +1,154 @@ +import unittest +from unittest.mock import patch, Mock, call +from datetime import datetime + +# Adjust the import path based on the actual location if test_github_connector.py +# is not in the same directory as github_connector.py or if paths are set up differently. 
+# Assuming surfsend_backend/app/connectors/test_github_connector.py +from surfsense_backend.app.connectors.github_connector import GitHubConnector +from github3.exceptions import ForbiddenError # Import the specific exception + +class TestGitHubConnector(unittest.TestCase): + + @patch('surfsense_backend.app.connectors.github_connector.github_login') + def test_get_user_repositories_uses_type_all(self, mock_github_login): + # Mock the GitHub client object and its methods + mock_gh_instance = Mock() + mock_github_login.return_value = mock_gh_instance + + # Mock the self.gh.me() call in __init__ to prevent an actual API call + mock_gh_instance.me.return_value = Mock() # Simple mock to pass initialization + + # Prepare mock repository data + mock_repo1_data = Mock() + mock_repo1_data.id = 1 + mock_repo1_data.name = "repo1" + mock_repo1_data.full_name = "user/repo1" + mock_repo1_data.private = False + mock_repo1_data.html_url = "http://example.com/user/repo1" + mock_repo1_data.description = "Test repo 1" + mock_repo1_data.updated_at = datetime(2023, 1, 1, 10, 30, 0) # Added time component + + mock_repo2_data = Mock() + mock_repo2_data.id = 2 + mock_repo2_data.name = "org-repo" + mock_repo2_data.full_name = "org/org-repo" + mock_repo2_data.private = True + mock_repo2_data.html_url = "http://example.com/org/org-repo" + mock_repo2_data.description = "Org repo" + mock_repo2_data.updated_at = datetime(2023, 1, 2, 12, 0, 0) # Added time component + + # Configure the mock for gh.repositories() call + # This method is an iterator, so it should return an iterable (e.g., a list) + mock_gh_instance.repositories.return_value = [mock_repo1_data, mock_repo2_data] + + connector = GitHubConnector(token="fake_token") + repositories = connector.get_user_repositories() + + # Assert that gh.repositories was called correctly + mock_gh_instance.repositories.assert_called_once_with(type='all', sort='updated') + + # Assert the structure and content of the returned data + expected_repositories = [ + { + "id": 1, "name": "repo1", "full_name": "user/repo1", "private": False, + "url": "http://example.com/user/repo1", "description": "Test repo 1", + "last_updated": datetime(2023, 1, 1, 10, 30, 0) + }, + { + "id": 2, "name": "org-repo", "full_name": "org/org-repo", "private": True, + "url": "http://example.com/org/org-repo", "description": "Org repo", + "last_updated": datetime(2023, 1, 2, 12, 0, 0) + } + ] + self.assertEqual(repositories, expected_repositories) + self.assertEqual(len(repositories), 2) + + @patch('surfsense_backend.app.connectors.github_connector.github_login') + def test_get_user_repositories_handles_empty_description_and_none_updated_at(self, mock_github_login): + # Mock the GitHub client object and its methods + mock_gh_instance = Mock() + mock_github_login.return_value = mock_gh_instance + mock_gh_instance.me.return_value = Mock() + + mock_repo_data = Mock() + mock_repo_data.id = 1 + mock_repo_data.name = "repo_no_desc" + mock_repo_data.full_name = "user/repo_no_desc" + mock_repo_data.private = False + mock_repo_data.html_url = "http://example.com/user/repo_no_desc" + mock_repo_data.description = None # Test None description + mock_repo_data.updated_at = None # Test None updated_at + + mock_gh_instance.repositories.return_value = [mock_repo_data] + connector = GitHubConnector(token="fake_token") + repositories = connector.get_user_repositories() + + mock_gh_instance.repositories.assert_called_once_with(type='all', sort='updated') + expected_repositories = [ + { + "id": 1, "name": "repo_no_desc", 
"full_name": "user/repo_no_desc", "private": False, + "url": "http://example.com/user/repo_no_desc", "description": "", # Expect empty string + "last_updated": None # Expect None + } + ] + self.assertEqual(repositories, expected_repositories) + + @patch('surfsense_backend.app.connectors.github_connector.github_login') + def test_github_connector_initialization_failure_forbidden(self, mock_github_login): + # Test that __init__ raises ValueError on auth failure (ForbiddenError) + mock_gh_instance = Mock() + mock_github_login.return_value = mock_gh_instance + + # Create a mock response object for the ForbiddenError + # The actual response structure might vary, but github3.py's ForbiddenError + # can be instantiated with just a response object that has a status_code. + mock_response = Mock() + mock_response.status_code = 403 # Typically Forbidden + + # Setup the side_effect for self.gh.me() + mock_gh_instance.me.side_effect = ForbiddenError(mock_response) + + with self.assertRaises(ValueError) as context: + GitHubConnector(token="invalid_token_forbidden") + self.assertIn("Invalid GitHub token or insufficient permissions.", str(context.exception)) + + @patch('surfsense_backend.app.connectors.github_connector.github_login') + def test_github_connector_initialization_failure_authentication_failed(self, mock_github_login): + # Test that __init__ raises ValueError on auth failure (AuthenticationFailed, which is a subclass of ForbiddenError) + # For github3.py, AuthenticationFailed is more specific for token issues. + from github3.exceptions import AuthenticationFailed + + mock_gh_instance = Mock() + mock_github_login.return_value = mock_gh_instance + + mock_response = Mock() + mock_response.status_code = 401 # Typically Unauthorized + + mock_gh_instance.me.side_effect = AuthenticationFailed(mock_response) + + with self.assertRaises(ValueError) as context: + GitHubConnector(token="invalid_token_authfailed") + self.assertIn("Invalid GitHub token or insufficient permissions.", str(context.exception)) + + @patch('surfsense_backend.app.connectors.github_connector.github_login') + def test_get_user_repositories_handles_api_exception(self, mock_github_login): + mock_gh_instance = Mock() + mock_github_login.return_value = mock_gh_instance + mock_gh_instance.me.return_value = Mock() + + # Simulate an exception when calling repositories + mock_gh_instance.repositories.side_effect = Exception("API Error") + + connector = GitHubConnector(token="fake_token") + # We expect it to log an error and return an empty list + with patch('surfsense_backend.app.connectors.github_connector.logger') as mock_logger: + repositories = connector.get_user_repositories() + + self.assertEqual(repositories, []) + mock_logger.error.assert_called_once() + self.assertIn("Failed to fetch GitHub repositories: API Error", mock_logger.error.call_args[0][0]) + + +if __name__ == '__main__': + unittest.main() diff --git a/surfsense_backend/app/connectors/test_slack_history.py b/surfsense_backend/app/connectors/test_slack_history.py new file mode 100644 index 000000000..ecff2c58c --- /dev/null +++ b/surfsense_backend/app/connectors/test_slack_history.py @@ -0,0 +1,420 @@ +import unittest +import time # Imported to be available for patching target module +from unittest.mock import patch, Mock, call +from slack_sdk.errors import SlackApiError + +# Since test_slack_history.py is in the same directory as slack_history.py +from .slack_history import SlackHistory + +class TestSlackHistoryGetAllChannels(unittest.TestCase): + + 
@patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_get_all_channels_pagination_with_delay(self, MockWebClient, mock_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + # Mock API responses now include is_private and is_member + page1_response = { + "channels": [ + {"name": "general", "id": "C1", "is_private": False, "is_member": True}, + {"name": "dev", "id": "C0", "is_private": False, "is_member": True} + ], + "response_metadata": {"next_cursor": "cursor123"} + } + page2_response = { + "channels": [{"name": "random", "id": "C2", "is_private": True, "is_member": True}], + "response_metadata": {"next_cursor": ""} + } + + mock_client_instance.conversations_list.side_effect = [ + page1_response, + page2_response + ] + + slack_history = SlackHistory(token="fake_token") + channels_list = slack_history.get_all_channels(include_private=True) + + expected_channels_list = [ + {"id": "C1", "name": "general", "is_private": False, "is_member": True}, + {"id": "C0", "name": "dev", "is_private": False, "is_member": True}, + {"id": "C2", "name": "random", "is_private": True, "is_member": True} + ] + + self.assertEqual(len(channels_list), 3) + self.assertListEqual(channels_list, expected_channels_list) # Assert list equality + + expected_calls = [ + call(types="public_channel,private_channel", cursor=None, limit=1000), + call(types="public_channel,private_channel", cursor="cursor123", limit=1000) + ] + mock_client_instance.conversations_list.assert_has_calls(expected_calls) + self.assertEqual(mock_client_instance.conversations_list.call_count, 2) + + mock_sleep.assert_called_once_with(3) + mock_logger.info.assert_called_once_with("Paginating for channels, waiting 3 seconds before next call. Cursor: cursor123") + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_get_all_channels_rate_limit_with_retry_after(self, MockWebClient, mock_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + mock_error_response = Mock() + mock_error_response.status_code = 429 + mock_error_response.headers = {'Retry-After': '5'} + + successful_response = { + "channels": [{"name": "general", "id": "C1", "is_private": False, "is_member": True}], + "response_metadata": {"next_cursor": ""} + } + + mock_client_instance.conversations_list.side_effect = [ + SlackApiError(message="ratelimited", response=mock_error_response), + successful_response + ] + + slack_history = SlackHistory(token="fake_token") + channels_list = slack_history.get_all_channels(include_private=True) + + expected_channels_list = [{"id": "C1", "name": "general", "is_private": False, "is_member": True}] + self.assertEqual(len(channels_list), 1) + self.assertListEqual(channels_list, expected_channels_list) + + mock_sleep.assert_called_once_with(5) + mock_logger.warning.assert_called_once_with("Slack API rate limit hit while fetching channels. Waiting for 5 seconds. 
Cursor: None") + + expected_calls = [ + call(types="public_channel,private_channel", cursor=None, limit=1000), + call(types="public_channel,private_channel", cursor=None, limit=1000) + ] + mock_client_instance.conversations_list.assert_has_calls(expected_calls) + self.assertEqual(mock_client_instance.conversations_list.call_count, 2) + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_get_all_channels_rate_limit_no_retry_after_valid_header(self, MockWebClient, mock_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + mock_error_response = Mock() + mock_error_response.status_code = 429 + mock_error_response.headers = {'Retry-After': 'invalid_value'} + + successful_response = { + "channels": [{"name": "general", "id": "C1", "is_private": False, "is_member": True}], + "response_metadata": {"next_cursor": ""} + } + + mock_client_instance.conversations_list.side_effect = [ + SlackApiError(message="ratelimited", response=mock_error_response), + successful_response + ] + + slack_history = SlackHistory(token="fake_token") + channels_list = slack_history.get_all_channels(include_private=True) + + expected_channels_list = [{"id": "C1", "name": "general", "is_private": False, "is_member": True}] + self.assertListEqual(channels_list, expected_channels_list) + mock_sleep.assert_called_once_with(60) # Default fallback + mock_logger.warning.assert_called_once_with("Slack API rate limit hit while fetching channels. Waiting for 60 seconds. Cursor: None") + self.assertEqual(mock_client_instance.conversations_list.call_count, 2) + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_get_all_channels_rate_limit_no_retry_after_header(self, MockWebClient, mock_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + mock_error_response = Mock() + mock_error_response.status_code = 429 + mock_error_response.headers = {} + + successful_response = { + "channels": [{"name": "general", "id": "C1", "is_private": False, "is_member": True}], + "response_metadata": {"next_cursor": ""} + } + + mock_client_instance.conversations_list.side_effect = [ + SlackApiError(message="ratelimited", response=mock_error_response), + successful_response + ] + + slack_history = SlackHistory(token="fake_token") + channels_list = slack_history.get_all_channels(include_private=True) + + expected_channels_list = [{"id": "C1", "name": "general", "is_private": False, "is_member": True}] + self.assertListEqual(channels_list, expected_channels_list) + mock_sleep.assert_called_once_with(60) # Default fallback + mock_logger.warning.assert_called_once_with("Slack API rate limit hit while fetching channels. Waiting for 60 seconds. 
Cursor: None") + self.assertEqual(mock_client_instance.conversations_list.call_count, 2) + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_get_all_channels_other_slack_api_error(self, MockWebClient, mock_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + mock_error_response = Mock() + mock_error_response.status_code = 500 + mock_error_response.headers = {} + mock_error_response.data = {"ok": False, "error": "internal_error"} + + original_error = SlackApiError(message="server error", response=mock_error_response) + mock_client_instance.conversations_list.side_effect = original_error + + slack_history = SlackHistory(token="fake_token") + + with self.assertRaises(SlackApiError) as context: + slack_history.get_all_channels(include_private=True) + + self.assertEqual(context.exception.response.status_code, 500) + self.assertIn("server error", str(context.exception)) + mock_sleep.assert_not_called() + mock_logger.warning.assert_not_called() # Ensure no rate limit log + mock_client_instance.conversations_list.assert_called_once_with( + types="public_channel,private_channel", cursor=None, limit=1000 + ) + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_get_all_channels_handles_missing_name_id_gracefully(self, MockWebClient, mock_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + response_with_malformed_data = { + "channels": [ + {"id": "C1_missing_name", "is_private": False, "is_member": True}, + {"name": "channel_missing_id", "is_private": False, "is_member": True}, + {"name": "general", "id": "C2_valid", "is_private": False, "is_member": True} + ], + "response_metadata": {"next_cursor": ""} + } + + mock_client_instance.conversations_list.return_value = response_with_malformed_data + + slack_history = SlackHistory(token="fake_token") + channels_list = slack_history.get_all_channels(include_private=True) + + expected_channels_list = [ + {"id": "C2_valid", "name": "general", "is_private": False, "is_member": True} + ] + self.assertEqual(len(channels_list), 1) + self.assertListEqual(channels_list, expected_channels_list) + + self.assertEqual(mock_logger.warning.call_count, 2) + mock_logger.warning.assert_any_call("Channel found with missing name or id. Data: {'id': 'C1_missing_name', 'is_private': False, 'is_member': True}") + mock_logger.warning.assert_any_call("Channel found with missing name or id. 
Data: {'name': 'channel_missing_id', 'is_private': False, 'is_member': True}") + + mock_sleep.assert_not_called() + mock_client_instance.conversations_list.assert_called_once_with( + types="public_channel,private_channel", cursor=None, limit=1000 + ) + +if __name__ == '__main__': + unittest.main() + +class TestSlackHistoryGetConversationHistory(unittest.TestCase): + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_proactive_delay_single_page(self, MockWebClient, mock_time_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + mock_client_instance.conversations_history.return_value = { + "messages": [{"text": "msg1"}], + "has_more": False + } + + slack_history = SlackHistory(token="fake_token") + slack_history.get_conversation_history(channel_id="C123") + + mock_time_sleep.assert_called_once_with(1.2) # Proactive delay + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_proactive_delay_multiple_pages(self, MockWebClient, mock_time_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + mock_client_instance.conversations_history.side_effect = [ + { + "messages": [{"text": "msg1"}], + "has_more": True, + "response_metadata": {"next_cursor": "cursor1"} + }, + { + "messages": [{"text": "msg2"}], + "has_more": False + } + ] + + slack_history = SlackHistory(token="fake_token") + slack_history.get_conversation_history(channel_id="C123") + + # Expected calls: 1.2 (page1), 1.2 (page2) + self.assertEqual(mock_time_sleep.call_count, 2) + mock_time_sleep.assert_has_calls([call(1.2), call(1.2)]) + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_retry_after_logic(self, MockWebClient, mock_time_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + mock_error_response = Mock() + mock_error_response.status_code = 429 + mock_error_response.headers = {'Retry-After': '5'} + + mock_client_instance.conversations_history.side_effect = [ + SlackApiError(message="ratelimited", response=mock_error_response), + {"messages": [{"text": "msg1"}], "has_more": False} + ] + + slack_history = SlackHistory(token="fake_token") + messages = slack_history.get_conversation_history(channel_id="C123") + + self.assertEqual(len(messages), 1) + self.assertEqual(messages[0]["text"], "msg1") + + # Expected sleep calls: 1.2 (proactive for 1st attempt), 5 (rate limit), 1.2 (proactive for 2nd attempt) + mock_time_sleep.assert_has_calls([call(1.2), call(5), call(1.2)], any_order=False) + mock_logger.warning.assert_called_once() # Check that a warning was logged for rate limiting + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_not_in_channel_error(self, MockWebClient, mock_time_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + mock_error_response = Mock() + mock_error_response.status_code = 403 # Typical for not_in_channel, but data matters more + mock_error_response.data = {'ok': False, 'error': 'not_in_channel'} + + # This error is now raised by the inner try-except, then caught by the outer one + 
mock_client_instance.conversations_history.side_effect = SlackApiError( + message="not_in_channel error", + response=mock_error_response + ) + + slack_history = SlackHistory(token="fake_token") + messages = slack_history.get_conversation_history(channel_id="C123") + + self.assertEqual(messages, []) + mock_logger.warning.assert_called_with( + "Bot is not in channel 'C123'. Cannot fetch history. Please add the bot to this channel." + ) + mock_time_sleep.assert_called_once_with(1.2) # Proactive delay before the API call + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_other_slack_api_error_propagates(self, MockWebClient, mock_time_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + mock_error_response = Mock() + mock_error_response.status_code = 500 + mock_error_response.data = {'ok': False, 'error': 'internal_error'} + original_error = SlackApiError(message="server error", response=mock_error_response) + + mock_client_instance.conversations_history.side_effect = original_error + + slack_history = SlackHistory(token="fake_token") + + with self.assertRaises(SlackApiError) as context: + slack_history.get_conversation_history(channel_id="C123") + + self.assertIn("Error retrieving history for channel C123", str(context.exception)) + self.assertIs(context.exception.response, mock_error_response) + mock_time_sleep.assert_called_once_with(1.2) # Proactive delay + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_general_exception_propagates(self, MockWebClient, mock_time_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + original_error = Exception("Something broke") + mock_client_instance.conversations_history.side_effect = original_error + + slack_history = SlackHistory(token="fake_token") + + with self.assertRaises(Exception) as context: # Check for generic Exception + slack_history.get_conversation_history(channel_id="C123") + + self.assertIs(context.exception, original_error) # Should re-raise the original error + mock_logger.error.assert_called_once_with("Unexpected error in get_conversation_history for channel C123: Something broke") + mock_time_sleep.assert_called_once_with(1.2) # Proactive delay + +class TestSlackHistoryGetUserInfo(unittest.TestCase): + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_retry_after_logic(self, MockWebClient, mock_time_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + mock_error_response = Mock() + mock_error_response.status_code = 429 + mock_error_response.headers = {'Retry-After': '3'} # Using 3 seconds for test + + successful_user_data = {"id": "U123", "name": "testuser"} + + mock_client_instance.users_info.side_effect = [ + SlackApiError(message="ratelimited_userinfo", response=mock_error_response), + {"user": successful_user_data} + ] + + slack_history = SlackHistory(token="fake_token") + user_info = slack_history.get_user_info(user_id="U123") + + self.assertEqual(user_info, successful_user_data) + + # Assert that time.sleep was called for the rate limit + mock_time_sleep.assert_called_once_with(3) + mock_logger.warning.assert_called_once_with( + "Rate limited by Slack on users.info for user U123. 
Retrying after 3 seconds." + ) + # Assert users_info was called twice (original + retry) + self.assertEqual(mock_client_instance.users_info.call_count, 2) + mock_client_instance.users_info.assert_has_calls([call(user="U123"), call(user="U123")]) + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') # time.sleep might be called by other logic, but not expected here + @patch('slack_sdk.WebClient') + def test_other_slack_api_error_propagates(self, MockWebClient, mock_time_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + + mock_error_response = Mock() + mock_error_response.status_code = 500 # Some other error + mock_error_response.data = {'ok': False, 'error': 'internal_server_error'} + original_error = SlackApiError(message="internal server error", response=mock_error_response) + + mock_client_instance.users_info.side_effect = original_error + + slack_history = SlackHistory(token="fake_token") + + with self.assertRaises(SlackApiError) as context: + slack_history.get_user_info(user_id="U123") + + # Check that the raised error is the one we expect + self.assertIn("Error retrieving user info for U123", str(context.exception)) + self.assertIs(context.exception.response, mock_error_response) + mock_time_sleep.assert_not_called() # No rate limit sleep + + @patch('surfsense_backend.app.connectors.slack_history.logger') + @patch('surfsense_backend.app.connectors.slack_history.time.sleep') + @patch('slack_sdk.WebClient') + def test_general_exception_propagates(self, MockWebClient, mock_time_sleep, mock_logger): + mock_client_instance = MockWebClient.return_value + original_error = Exception("A very generic problem") + mock_client_instance.users_info.side_effect = original_error + + slack_history = SlackHistory(token="fake_token") + + with self.assertRaises(Exception) as context: + slack_history.get_user_info(user_id="U123") + + self.assertIs(context.exception, original_error) # Check it's the exact same exception + mock_logger.error.assert_called_once_with( + "Unexpected error in get_user_info for user U123: A very generic problem" + ) + mock_time_sleep.assert_not_called() # No rate limit sleep diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 4426f4ffa..7ee566311 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -3,11 +3,7 @@ from datetime import datetime, timezone from enum import Enum from fastapi import Depends -from fastapi_users.db import ( - SQLAlchemyBaseOAuthAccountTableUUID, - SQLAlchemyBaseUserTableUUID, - SQLAlchemyUserDatabase, -) + from pgvector.sqlalchemy import Vector from sqlalchemy import ( ARRAY, @@ -30,6 +26,18 @@ from app.config import config from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever +if config.AUTH_TYPE == "GOOGLE": + from fastapi_users.db import ( + SQLAlchemyBaseOAuthAccountTableUUID, + SQLAlchemyBaseUserTableUUID, + SQLAlchemyUserDatabase, + ) +else: + from fastapi_users.db import ( + SQLAlchemyBaseUserTableUUID, + SQLAlchemyUserDatabase, + ) + DATABASE_URL = config.DATABASE_URL @@ -44,8 +52,9 @@ class DocumentType(str, Enum): LINEAR_CONNECTOR = "LINEAR_CONNECTOR" class SearchSourceConnectorType(str, Enum): - SERPER_API = "SERPER_API" + SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT TAVILY_API = "TAVILY_API" + 
LINKUP_API = "LINKUP_API" SLACK_CONNECTOR = "SLACK_CONNECTOR" NOTION_CONNECTOR = "NOTION_CONNECTOR" GITHUB_CONNECTOR = "GITHUB_CONNECTOR" @@ -75,7 +84,7 @@ class Chat(BaseModel, TimestampMixin): __tablename__ = "chats" type = Column(SQLAlchemyEnum(ChatType), nullable=False) - title = Column(String(200), nullable=False, index=True) + title = Column(String, nullable=False, index=True) initial_connectors = Column(ARRAY(String), nullable=True) messages = Column(JSON, nullable=False) @@ -85,11 +94,12 @@ class Chat(BaseModel, TimestampMixin): class Document(BaseModel, TimestampMixin): __tablename__ = "documents" - title = Column(String(200), nullable=False, index=True) + title = Column(String, nullable=False, index=True) document_type = Column(SQLAlchemyEnum(DocumentType), nullable=False) document_metadata = Column(JSON, nullable=True) content = Column(Text, nullable=False) + content_hash = Column(String, nullable=False, index=True, unique=True) embedding = Column(Vector(config.embedding_model_instance.dimension)) search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) @@ -108,9 +118,8 @@ class Chunk(BaseModel, TimestampMixin): class Podcast(BaseModel, TimestampMixin): __tablename__ = "podcasts" - title = Column(String(200), nullable=False, index=True) - is_generated = Column(Boolean, nullable=False, default=False) - podcast_content = Column(Text, nullable=False, default="") + title = Column(String, nullable=False, index=True) + podcast_transcript = Column(JSON, nullable=False, default={}) file_location = Column(String(500), nullable=False, default="") search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) @@ -141,17 +150,22 @@ class SearchSourceConnector(BaseModel, TimestampMixin): user_id = Column(UUID(as_uuid=True), ForeignKey("user.id", ondelete='CASCADE'), nullable=False) user = relationship("User", back_populates="search_source_connectors") - -class OAuthAccount(SQLAlchemyBaseOAuthAccountTableUUID, Base): - pass +if config.AUTH_TYPE == "GOOGLE": + class OAuthAccount(SQLAlchemyBaseOAuthAccountTableUUID, Base): + pass -class User(SQLAlchemyBaseUserTableUUID, Base): - oauth_accounts: Mapped[list[OAuthAccount]] = relationship( - "OAuthAccount", lazy="joined" - ) - search_spaces = relationship("SearchSpace", back_populates="user") - search_source_connectors = relationship("SearchSourceConnector", back_populates="user") + class User(SQLAlchemyBaseUserTableUUID, Base): + oauth_accounts: Mapped[list[OAuthAccount]] = relationship( + "OAuthAccount", lazy="joined" + ) + search_spaces = relationship("SearchSpace", back_populates="user") + search_source_connectors = relationship("SearchSourceConnector", back_populates="user") +else: + class User(SQLAlchemyBaseUserTableUUID, Base): + + search_spaces = relationship("SearchSpace", back_populates="user") + search_source_connectors = relationship("SearchSourceConnector", back_populates="user") engine = create_async_engine(DATABASE_URL) @@ -180,8 +194,12 @@ async def get_async_session() -> AsyncGenerator[AsyncSession, None]: yield session -async def get_user_db(session: AsyncSession = Depends(get_async_session)): - yield SQLAlchemyUserDatabase(session, User, OAuthAccount) +if config.AUTH_TYPE == "GOOGLE": + async def get_user_db(session: AsyncSession = Depends(get_async_session)): + yield SQLAlchemyUserDatabase(session, User, OAuthAccount) +else: + async def get_user_db(session: AsyncSession = Depends(get_async_session)): + yield SQLAlchemyUserDatabase(session, User) 
async def get_chucks_hybrid_search_retriever(session: AsyncSession = Depends(get_async_session)): return ChucksHybridSearchRetriever(session) diff --git a/surfsense_backend/app/retriver/documents_hybrid_search.py b/surfsense_backend/app/retriver/documents_hybrid_search.py index 060c3b17e..2163635eb 100644 --- a/surfsense_backend/app/retriver/documents_hybrid_search.py +++ b/surfsense_backend/app/retriver/documents_hybrid_search.py @@ -113,8 +113,6 @@ class DocumentHybridSearchRetriever: search_space_id: Optional search space ID to filter results document_type: Optional document type to filter results (e.g., "FILE", "CRAWLED_URL") - Returns: - List of dictionaries containing document data and relevance scores """ from sqlalchemy import select, func, text from sqlalchemy.orm import joinedload @@ -224,10 +222,22 @@ class DocumentHybridSearchRetriever: # Convert to serializable dictionaries serialized_results = [] for document, score in documents_with_scores: + # Fetch associated chunks for this document + from sqlalchemy import select + from app.db import Chunk + + chunks_query = select(Chunk).where(Chunk.document_id == document.id).order_by(Chunk.id) + chunks_result = await self.db_session.execute(chunks_query) + chunks = chunks_result.scalars().all() + + # Concatenate chunks content + concatenated_chunks_content = " ".join([chunk.content for chunk in chunks]) if chunks else document.content + serialized_results.append({ "document_id": document.id, "title": document.title, "content": document.content, + "chunks_content": concatenated_chunks_content, "document_type": document.document_type.value if hasattr(document, 'document_type') else None, "metadata": document.document_metadata, "score": float(score), # Ensure score is a Python float diff --git a/surfsense_backend/app/routes/chats_routes.py b/surfsense_backend/app/routes/chats_routes.py index 74ea97b06..9a2aa79b6 100644 --- a/surfsense_backend/app/routes/chats_routes.py +++ b/surfsense_backend/app/routes/chats_routes.py @@ -10,6 +10,8 @@ from fastapi.responses import StreamingResponse from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select +from langchain.schema import HumanMessage, AIMessage + router = APIRouter() @@ -20,14 +22,16 @@ async def handle_chat_data( user: User = Depends(current_active_user) ): messages = request.messages - if messages[-1].role != "user": + if messages[-1]['role'] != "user": raise HTTPException( status_code=400, detail="Last message must be a user message") - user_query = messages[-1].content + user_query = messages[-1]['content'] search_space_id = request.data.get('search_space_id') research_mode: str = request.data.get('research_mode') selected_connectors: List[str] = request.data.get('selected_connectors') + + search_mode_str = request.data.get('search_mode', "CHUNKS") # Convert search_space_id to integer if it's a string if search_space_id and isinstance(search_space_id, str): @@ -43,6 +47,21 @@ async def handle_chat_data( except HTTPException: raise HTTPException( status_code=403, detail="You don't have access to this search space") + + langchain_chat_history = [] + for message in messages[:-1]: + if message['role'] == "user": + langchain_chat_history.append(HumanMessage(content=message['content'])) + elif message['role'] == "assistant": + # Last annotation type will always be "ANSWER" here + answer_annotation = message['annotations'][-1] + answer_text = "" + if answer_annotation['type'] == "ANSWER": + answer_text = 
answer_annotation['content'] + # If content is a list, join it into a single string + if isinstance(answer_text, list): + answer_text = "\n".join(answer_text) + langchain_chat_history.append(AIMessage(content=answer_text)) response = StreamingResponse(stream_connector_search_results( user_query, @@ -50,7 +69,9 @@ async def handle_chat_data( search_space_id, # Already converted to int in lines 32-37 session, research_mode, - selected_connectors + selected_connectors, + langchain_chat_history, + search_mode_str )) response.headers['x-vercel-ai-data-stream'] = 'v1' return response diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index aa42476cc..acd246e18 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1,3 +1,4 @@ +from litellm import atranscription from fastapi import APIRouter, Depends, BackgroundTasks, UploadFile, Form, HTTPException from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select @@ -6,7 +7,8 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead from app.users import current_active_user from app.utils.check_ownership import check_ownership -from app.tasks.background_tasks import add_extension_received_document, add_received_file_document, add_crawled_url_document, add_youtube_video_document +from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud +from app.config import config as app_config # Force asyncio to use standard event loop before unstructured imports import asyncio try: @@ -15,12 +17,11 @@ except RuntimeError: pass import os os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1" -from langchain_unstructured import UnstructuredLoader -from app.config import config -import json + router = APIRouter() + @router.post("/documents/") async def create_documents( request: DocumentsCreate, @@ -31,19 +32,19 @@ async def create_documents( try: # Check if the user owns the search space await check_ownership(session, SearchSpace, request.search_space_id, user) - + if request.document_type == DocumentType.EXTENSION: for individual_document in request.content: fastapi_background_tasks.add_task( - process_extension_document_with_new_session, - individual_document, + process_extension_document_with_new_session, + individual_document, request.search_space_id ) elif request.document_type == DocumentType.CRAWLED_URL: - for url in request.content: + for url in request.content: fastapi_background_tasks.add_task( - process_crawled_url_with_new_session, - url, + process_crawled_url_with_new_session, + url, request.search_space_id ) elif request.document_type == DocumentType.YOUTUBE_VIDEO: @@ -58,7 +59,7 @@ async def create_documents( status_code=400, detail="Invalid document type" ) - + await session.commit() return {"message": "Documents processed successfully"} except HTTPException: @@ -70,6 +71,7 @@ async def create_documents( detail=f"Failed to process documents: {str(e)}" ) + @router.post("/documents/fileupload") async def create_documents( files: list[UploadFile], @@ -80,27 +82,26 @@ async def create_documents( ): try: await check_ownership(session, SearchSpace, search_space_id, user) - + if not files: raise HTTPException(status_code=400, 
detail="No files provided") - + for file in files: try: # Save file to a temporary location to avoid stream issues import tempfile import aiofiles import os - + # Create temp file with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file: temp_path = temp_file.name - + # Write uploaded file to temp file content = await file.read() with open(temp_path, "wb") as f: f.write(content) - - # Process in background to avoid uvloop conflicts + fastapi_background_tasks.add_task( process_file_in_background_with_new_session, temp_path, @@ -112,7 +113,7 @@ async def create_documents( status_code=422, detail=f"Failed to process file {file.filename}: {str(e)}" ) - + await session.commit() return {"message": "Files uploaded for processing"} except HTTPException: @@ -132,40 +133,136 @@ async def process_file_in_background( session: AsyncSession ): try: - # Use synchronous unstructured API to avoid event loop issues - from langchain_community.document_loaders import UnstructuredFileLoader - - # Process the file - loader = UnstructuredFileLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - - docs = loader.load() - - # Clean up the temp file - import os - try: - os.unlink(file_path) - except: - pass - - # Pass the documents to the existing background task - await add_received_file_document( - session, - filename, - docs, - search_space_id - ) + # Check if the file is a markdown file + if filename.lower().endswith(('.md', '.markdown')): + # For markdown files, read the content directly + with open(file_path, 'r', encoding='utf-8') as f: + markdown_content = f.read() + + # Clean up the temp file + import os + try: + os.unlink(file_path) + except: + pass + + # Process markdown directly through specialized function + await add_received_markdown_file_document( + session, + filename, + markdown_content, + search_space_id + ) + # Check if the file is an audio file + elif filename.lower().endswith(('.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.wav', '.webm')): + # Open the audio file for transcription + with open(file_path, "rb") as audio_file: + # Use LiteLLM for audio transcription + if app_config.STT_SERVICE_API_BASE: + transcription_response = await atranscription( + model=app_config.STT_SERVICE, + file=audio_file, + api_base=app_config.STT_SERVICE_API_BASE + ) + else: + transcription_response = await atranscription( + model=app_config.STT_SERVICE, + file=audio_file + ) + + # Extract the transcribed text + transcribed_text = transcription_response.get("text", "") + + # Add metadata about the transcription + transcribed_text = f"# Transcription of {filename}\n\n{transcribed_text}" + + # Clean up the temp file + try: + os.unlink(file_path) + except: + pass + + # Process transcription as markdown document + await add_received_markdown_file_document( + session, + filename, + transcribed_text, + search_space_id + ) + else: + if app_config.ETL_SERVICE == "UNSTRUCTURED": + from langchain_unstructured import UnstructuredLoader + + # Process the file + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + + docs = await loader.aload() + + # Clean up the temp file + import os + try: + os.unlink(file_path) + except: + pass + + # Pass the documents to the existing background task + await add_received_file_document_using_unstructured( + 
session, + filename, + docs, + search_space_id + ) + elif app_config.ETL_SERVICE == "LLAMACLOUD": + from llama_cloud_services import LlamaParse + from llama_cloud_services.parse.utils import ResultType + + + # Create LlamaParse parser instance + parser = LlamaParse( + api_key=app_config.LLAMA_CLOUD_API_KEY, + num_workers=1, # Use single worker for file processing + verbose=True, + language="en", + result_type=ResultType.MD + ) + + # Parse the file asynchronously + result = await parser.aparse(file_path) + + # Clean up the temp file + import os + try: + os.unlink(file_path) + except: + pass + + # Get markdown documents from the result + markdown_documents = await result.aget_markdown_documents(split_by_page=False) + + for doc in markdown_documents: + # Extract text content from the markdown documents + markdown_content = doc.text + + # Process the documents using our LlamaCloud background task + await add_received_file_document_using_llamacloud( + session, + filename, + llamacloud_markdown_document=markdown_content, + search_space_id=search_space_id + ) except Exception as e: import logging logging.error(f"Error processing file in background: {str(e)}") + @router.get("/documents/", response_model=List[DocumentRead]) async def read_documents( skip: int = 0, @@ -175,17 +272,18 @@ async def read_documents( user: User = Depends(current_active_user) ): try: - query = select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id) - + query = select(Document).join(SearchSpace).filter( + SearchSpace.user_id == user.id) + # Filter by search_space_id if provided if search_space_id is not None: query = query.filter(Document.search_space_id == search_space_id) - + result = await session.execute( query.offset(skip).limit(limit) ) db_documents = result.scalars().all() - + # Convert database objects to API-friendly format api_documents = [] for doc in db_documents: @@ -198,7 +296,7 @@ async def read_documents( created_at=doc.created_at, search_space_id=doc.search_space_id )) - + return api_documents except Exception as e: raise HTTPException( @@ -206,6 +304,7 @@ async def read_documents( detail=f"Failed to fetch documents: {str(e)}" ) + @router.get("/documents/{document_id}", response_model=DocumentRead) async def read_document( document_id: int, @@ -219,13 +318,13 @@ async def read_document( .filter(Document.id == document_id, SearchSpace.user_id == user.id) ) document = result.scalars().first() - + if not document: raise HTTPException( status_code=404, detail=f"Document with id {document_id} not found" ) - + # Convert database object to API-friendly format return DocumentRead( id=document.id, @@ -242,6 +341,7 @@ async def read_document( detail=f"Failed to fetch document: {str(e)}" ) + @router.put("/documents/{document_id}", response_model=DocumentRead) async def update_document( document_id: int, @@ -257,19 +357,19 @@ async def update_document( .filter(Document.id == document_id, SearchSpace.user_id == user.id) ) db_document = result.scalars().first() - + if not db_document: raise HTTPException( status_code=404, detail=f"Document with id {document_id} not found" ) - + update_data = document_update.model_dump(exclude_unset=True) for key, value in update_data.items(): setattr(db_document, key, value) await session.commit() await session.refresh(db_document) - + # Convert to DocumentRead for response return DocumentRead( id=db_document.id, @@ -289,6 +389,7 @@ async def update_document( detail=f"Failed to update document: {str(e)}" ) + @router.delete("/documents/{document_id}", 
response_model=dict) async def delete_document( document_id: int, @@ -303,13 +404,13 @@ async def delete_document( .filter(Document.id == document_id, SearchSpace.user_id == user.id) ) document = result.scalars().first() - + if not document: raise HTTPException( status_code=404, detail=f"Document with id {document_id} not found" ) - + await session.delete(document) await session.commit() return {"message": "Document deleted successfully"} @@ -320,16 +421,16 @@ async def delete_document( raise HTTPException( status_code=500, detail=f"Failed to delete document: {str(e)}" - ) - - + ) + + async def process_extension_document_with_new_session( individual_document, search_space_id: int ): """Create a new session and process extension document.""" from app.db import async_session_maker - + async with async_session_maker() as session: try: await add_extension_received_document(session, individual_document, search_space_id) @@ -337,13 +438,14 @@ async def process_extension_document_with_new_session( import logging logging.error(f"Error processing extension document: {str(e)}") + async def process_crawled_url_with_new_session( url: str, search_space_id: int ): """Create a new session and process crawled URL.""" from app.db import async_session_maker - + async with async_session_maker() as session: try: await add_crawled_url_document(session, url, search_space_id) @@ -351,6 +453,7 @@ async def process_crawled_url_with_new_session( import logging logging.error(f"Error processing crawled URL: {str(e)}") + async def process_file_in_background_with_new_session( file_path: str, filename: str, @@ -358,17 +461,18 @@ async def process_file_in_background_with_new_session( ): """Create a new session and process file.""" from app.db import async_session_maker - + async with async_session_maker() as session: await process_file_in_background(file_path, filename, search_space_id, session) + async def process_youtube_video_with_new_session( url: str, search_space_id: int ): """Create a new session and process YouTube video.""" from app.db import async_session_maker - + async with async_session_maker() as session: try: await add_youtube_video_document(session, url, search_space_id) @@ -376,3 +480,4 @@ async def process_youtube_video_with_new_session( import logging logging.error(f"Error processing YouTube video: {str(e)}") + diff --git a/surfsense_backend/app/routes/podcasts_routes.py b/surfsense_backend/app/routes/podcasts_routes.py index 7ac1da1ba..bc82e21d0 100644 --- a/surfsense_backend/app/routes/podcasts_routes.py +++ b/surfsense_backend/app/routes/podcasts_routes.py @@ -1,12 +1,16 @@ -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.exc import IntegrityError, SQLAlchemyError from typing import List -from app.db import get_async_session, User, SearchSpace, Podcast -from app.schemas import PodcastCreate, PodcastUpdate, PodcastRead +from app.db import get_async_session, User, SearchSpace, Podcast, Chat +from app.schemas import PodcastCreate, PodcastUpdate, PodcastRead, PodcastGenerateRequest from app.users import current_active_user from app.utils.check_ownership import check_ownership +from app.tasks.podcast_tasks import generate_chat_podcast +from fastapi.responses import StreamingResponse +import os +from pathlib import Path router = APIRouter() @@ -119,4 +123,121 @@ async def delete_podcast( raise he except SQLAlchemyError: await 
session.rollback() - raise HTTPException(status_code=500, detail="Database error occurred while deleting podcast") \ No newline at end of file + raise HTTPException(status_code=500, detail="Database error occurred while deleting podcast") + +async def generate_chat_podcast_with_new_session( + chat_id: int, + search_space_id: int, + podcast_title: str = "SurfSense Podcast" +): + """Create a new session and process chat podcast generation.""" + from app.db import async_session_maker + + async with async_session_maker() as session: + try: + await generate_chat_podcast(session, chat_id, search_space_id, podcast_title) + except Exception as e: + import logging + logging.error(f"Error generating podcast from chat: {str(e)}") + +@router.post("/podcasts/generate/") +async def generate_podcast( + request: PodcastGenerateRequest, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), + fastapi_background_tasks: BackgroundTasks = BackgroundTasks() +): + try: + # Check if the user owns the search space + await check_ownership(session, SearchSpace, request.search_space_id, user) + + if request.type == "CHAT": + # Verify that all chat IDs belong to this user and search space + query = select(Chat).filter( + Chat.id.in_(request.ids), + Chat.search_space_id == request.search_space_id + ).join(SearchSpace).filter(SearchSpace.user_id == user.id) + + result = await session.execute(query) + valid_chats = result.scalars().all() + valid_chat_ids = [chat.id for chat in valid_chats] + + # If any requested ID is not in valid IDs, raise error immediately + if len(valid_chat_ids) != len(request.ids): + raise HTTPException( + status_code=403, + detail="One or more chat IDs do not belong to this user or search space" + ) + + # Only add a single task with the first chat ID + for chat_id in valid_chat_ids: + fastapi_background_tasks.add_task( + generate_chat_podcast_with_new_session, + chat_id, + request.search_space_id, + request.podcast_title + ) + + return { + "message": "Podcast generation started", + } + except HTTPException as he: + raise he + except IntegrityError as e: + await session.rollback() + raise HTTPException(status_code=400, detail="Podcast generation failed due to constraint violation") + except SQLAlchemyError as e: + await session.rollback() + raise HTTPException(status_code=500, detail="Database error occurred while generating podcast") + except Exception as e: + await session.rollback() + raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}") + +@router.get("/podcasts/{podcast_id}/stream") +async def stream_podcast( + podcast_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user) +): + """Stream a podcast audio file.""" + try: + # Get the podcast and check if user has access + result = await session.execute( + select(Podcast) + .join(SearchSpace) + .filter(Podcast.id == podcast_id, SearchSpace.user_id == user.id) + ) + podcast = result.scalars().first() + + if not podcast: + raise HTTPException( + status_code=404, + detail="Podcast not found or you don't have permission to access it" + ) + + # Get the file path + file_path = podcast.file_location + + # Check if the file exists + if not os.path.isfile(file_path): + raise HTTPException(status_code=404, detail="Podcast audio file not found") + + # Define a generator function to stream the file + def iterfile(): + with open(file_path, mode="rb") as file_like: + yield from file_like + + # Return a streaming response with 
appropriate headers + return StreamingResponse( + iterfile(), + media_type="audio/mpeg", + headers={ + "Accept-Ranges": "bytes", + "Content-Disposition": f"inline; filename={Path(file_path).name}" + } + ) + + except HTTPException as he: + raise he + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error streaming podcast: {str(e)}") \ No newline at end of file diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index ff5cce148..15c815032 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -21,7 +21,7 @@ from app.utils.check_ownership import check_ownership from pydantic import BaseModel, Field, ValidationError from app.tasks.connectors_indexing_tasks import index_slack_messages, index_notion_pages, index_github_repos, index_linear_issues from app.connectors.github_connector import GitHubConnector -from datetime import datetime, timezone, timedelta +from datetime import datetime, timedelta import logging # Set up logging diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py index 07adf24de..21688dfb0 100644 --- a/surfsense_backend/app/schemas/__init__.py +++ b/surfsense_backend/app/schemas/__init__.py @@ -10,7 +10,7 @@ from .documents import ( DocumentRead, ) from .chunks import ChunkBase, ChunkCreate, ChunkUpdate, ChunkRead -from .podcasts import PodcastBase, PodcastCreate, PodcastUpdate, PodcastRead +from .podcasts import PodcastBase, PodcastCreate, PodcastUpdate, PodcastRead, PodcastGenerateRequest from .chats import ChatBase, ChatCreate, ChatUpdate, ChatRead, AISDKChatRequest from .search_source_connector import SearchSourceConnectorBase, SearchSourceConnectorCreate, SearchSourceConnectorUpdate, SearchSourceConnectorRead @@ -39,6 +39,7 @@ __all__ = [ "PodcastCreate", "PodcastUpdate", "PodcastRead", + "PodcastGenerateRequest", "ChatBase", "ChatCreate", "ChatUpdate", diff --git a/surfsense_backend/app/schemas/base.py b/surfsense_backend/app/schemas/base.py index 4bd7b2262..d357aabcb 100644 --- a/surfsense_backend/app/schemas/base.py +++ b/surfsense_backend/app/schemas/base.py @@ -1,8 +1,10 @@ from datetime import datetime -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict class TimestampModel(BaseModel): created_at: datetime + model_config = ConfigDict(from_attributes=True) class IDModel(BaseModel): - id: int \ No newline at end of file + id: int + model_config = ConfigDict(from_attributes=True) \ No newline at end of file diff --git a/surfsense_backend/app/schemas/chats.py b/surfsense_backend/app/schemas/chats.py index ad7829b26..82191fbd6 100644 --- a/surfsense_backend/app/schemas/chats.py +++ b/surfsense_backend/app/schemas/chats.py @@ -1,8 +1,10 @@ from typing import Any, Dict, List, Optional -from pydantic import BaseModel -from sqlalchemy import JSON -from .base import IDModel, TimestampModel + from app.db import ChatType +from pydantic import BaseModel, ConfigDict + +from .base import IDModel, TimestampModel + class ChatBase(BaseModel): type: ChatType @@ -25,14 +27,14 @@ class ToolInvocation(BaseModel): result: dict -class ClientMessage(BaseModel): - role: str - content: str - experimental_attachments: Optional[List[ClientAttachment]] = None - toolInvocations: Optional[List[ToolInvocation]] = None +# class ClientMessage(BaseModel): +# role: str +# content: str +# experimental_attachments: 
Optional[List[ClientAttachment]] = None +# toolInvocations: Optional[List[ToolInvocation]] = None class AISDKChatRequest(BaseModel): - messages: List[ClientMessage] + messages: List[Any] data: Optional[Dict[str, Any]] = None class ChatCreate(ChatBase): @@ -42,5 +44,4 @@ class ChatUpdate(ChatBase): pass class ChatRead(ChatBase, IDModel, TimestampModel): - class Config: - from_attributes = True \ No newline at end of file + model_config = ConfigDict(from_attributes=True) \ No newline at end of file diff --git a/surfsense_backend/app/schemas/chunks.py b/surfsense_backend/app/schemas/chunks.py index 4230981ec..de0764fd4 100644 --- a/surfsense_backend/app/schemas/chunks.py +++ b/surfsense_backend/app/schemas/chunks.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from .base import IDModel, TimestampModel class ChunkBase(BaseModel): @@ -12,5 +12,4 @@ class ChunkUpdate(ChunkBase): pass class ChunkRead(ChunkBase, IDModel, TimestampModel): - class Config: - from_attributes = True \ No newline at end of file + model_config = ConfigDict(from_attributes=True) \ No newline at end of file diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index dcd71879c..ad8bd9599 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -1,7 +1,5 @@ from typing import List, Any -from pydantic import BaseModel -from sqlalchemy import JSON -from .base import IDModel, TimestampModel +from pydantic import BaseModel, ConfigDict from app.db import DocumentType from datetime import datetime @@ -37,6 +35,5 @@ class DocumentRead(BaseModel): created_at: datetime search_space_id: int - class Config: - from_attributes = True + model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/schemas/podcasts.py b/surfsense_backend/app/schemas/podcasts.py index fbec5482b..0356dd0b7 100644 --- a/surfsense_backend/app/schemas/podcasts.py +++ b/surfsense_backend/app/schemas/podcasts.py @@ -1,10 +1,10 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict +from typing import Any, List, Literal from .base import IDModel, TimestampModel class PodcastBase(BaseModel): title: str - is_generated: bool = False - podcast_content: str = "" + podcast_transcript: List[Any] file_location: str = "" search_space_id: int @@ -15,5 +15,10 @@ class PodcastUpdate(PodcastBase): pass class PodcastRead(PodcastBase, IDModel, TimestampModel): - class Config: - from_attributes = True \ No newline at end of file + model_config = ConfigDict(from_attributes=True) + +class PodcastGenerateRequest(BaseModel): + type: Literal["DOCUMENT", "CHAT"] + ids: List[int] + search_space_id: int + podcast_title: str = "SurfSense Podcast" \ No newline at end of file diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index 6accc12af..b136757fd 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -1,7 +1,7 @@ from datetime import datetime import uuid from typing import Dict, Any, Optional -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, field_validator, ConfigDict from .base import IDModel, TimestampModel from app.db import SearchSourceConnectorType @@ -36,6 +36,16 @@ class SearchSourceConnectorBase(BaseModel): # Ensure the API key is not empty if not config.get("TAVILY_API_KEY"): raise 
ValueError("TAVILY_API_KEY cannot be empty") + + elif connector_type == SearchSourceConnectorType.LINKUP_API: + # For LINKUP_API, only allow LINKUP_API_KEY + allowed_keys = ["LINKUP_API_KEY"] + if set(config.keys()) != set(allowed_keys): + raise ValueError(f"For LINKUP_API connector type, config must only contain these keys: {allowed_keys}") + + # Ensure the API key is not empty + if not config.get("LINKUP_API_KEY"): + raise ValueError("LINKUP_API_KEY cannot be empty") elif connector_type == SearchSourceConnectorType.SLACK_CONNECTOR: # For SLACK_CONNECTOR, only allow SLACK_BOT_TOKEN @@ -96,5 +106,4 @@ class SearchSourceConnectorUpdate(BaseModel): class SearchSourceConnectorRead(SearchSourceConnectorBase, IDModel, TimestampModel): user_id: uuid.UUID - class Config: - from_attributes = True + model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/schemas/search_space.py b/surfsense_backend/app/schemas/search_space.py index feebcf1ac..2c99c45ac 100644 --- a/surfsense_backend/app/schemas/search_space.py +++ b/surfsense_backend/app/schemas/search_space.py @@ -1,7 +1,7 @@ from datetime import datetime import uuid from typing import Optional -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from .base import IDModel, TimestampModel class SearchSpaceBase(BaseModel): @@ -19,5 +19,4 @@ class SearchSpaceRead(SearchSpaceBase, IDModel, TimestampModel): created_at: datetime user_id: uuid.UUID - class Config: - from_attributes = True \ No newline at end of file + model_config = ConfigDict(from_attributes=True) \ No newline at end of file diff --git a/surfsense_backend/app/tasks/background_tasks.py b/surfsense_backend/app/tasks/background_tasks.py index b2f6f8c81..18ac2917a 100644 --- a/surfsense_backend/app/tasks/background_tasks.py +++ b/surfsense_backend/app/tasks/background_tasks.py @@ -1,27 +1,29 @@ from typing import Optional, List from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.future import select from app.db import Document, DocumentType, Chunk from app.schemas import ExtensionDocumentContent from app.config import config from app.prompts import SUMMARY_PROMPT_TEMPLATE from datetime import datetime -from app.utils.document_converters import convert_document_to_markdown +from app.utils.document_converters import convert_document_to_markdown, generate_content_hash from langchain_core.documents import Document as LangChainDocument from langchain_community.document_loaders import FireCrawlLoader, AsyncChromiumLoader from langchain_community.document_transformers import MarkdownifyTransformer import validators +from youtube_transcript_api import YouTubeTranscriptApi +from urllib.parse import urlparse, parse_qs +import aiohttp +import logging md = MarkdownifyTransformer() async def add_crawled_url_document( - session: AsyncSession, - url: str, - search_space_id: int + session: AsyncSession, url: str, search_space_id: int ) -> Optional[Document]: try: - if not validators.url(url): raise ValueError(f"Url {url} is not a valid URL address") @@ -33,7 +35,7 @@ async def add_crawled_url_document( params={ "formats": ["markdown"], "excludeTags": ["a"], - } + }, ) else: crawl_loader = AsyncChromiumLoader(urls=[url], headless=True) @@ -43,20 +45,21 @@ async def add_crawled_url_document( if type(crawl_loader) == FireCrawlLoader: content_in_markdown = url_crawled[0].page_content elif type(crawl_loader) == AsyncChromiumLoader: - content_in_markdown = md.transform_documents(url_crawled)[ - 
0].page_content + content_in_markdown = md.transform_documents(url_crawled)[0].page_content # Format document metadata in a more maintainable way metadata_sections = [ - ("METADATA", [ - f"{key.upper()}: {value}" for key, value in url_crawled[0].metadata.items() - ]), - ("CONTENT", [ - "FORMAT: markdown", - "TEXT_START", - content_in_markdown, - "TEXT_END" - ]) + ( + "METADATA", + [ + f"{key.upper()}: {value}" + for key, value in url_crawled[0].metadata.items() + ], + ), + ( + "CONTENT", + ["FORMAT: markdown", "TEXT_START", content_in_markdown, "TEXT_END"], + ), ] # Build the document string more efficiently @@ -69,31 +72,48 @@ async def add_crawled_url_document( document_parts.append(f"") document_parts.append("") - combined_document_string = '\n'.join(document_parts) + combined_document_string = "\n".join(document_parts) + content_hash = generate_content_hash(combined_document_string) + + # Check if document with this content hash already exists + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document = existing_doc_result.scalars().first() + + if existing_document: + logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.") + return existing_document # Generate summary summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance - summary_result = await summary_chain.ainvoke({"document": combined_document_string}) + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed( - summary_content) + summary_embedding = config.embedding_model_instance.embed(summary_content) # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) for chunk in config.chunker_instance.chunk(content_in_markdown) ] # Create and store document document = Document( search_space_id=search_space_id, - title=url_crawled[0].metadata['title'] if type( - crawl_loader) == FireCrawlLoader else url_crawled[0].metadata['source'], + title=url_crawled[0].metadata["title"] + if type(crawl_loader) == FireCrawlLoader + else url_crawled[0].metadata["source"], document_type=DocumentType.CRAWLED_URL, document_metadata=url_crawled[0].metadata, content=summary_content, embedding=summary_embedding, - chunks=chunks + chunks=chunks, + content_hash=content_hash, ) session.add(document) @@ -111,9 +131,7 @@ async def add_crawled_url_document( async def add_extension_received_document( - session: AsyncSession, - content: ExtensionDocumentContent, - search_space_id: int + session: AsyncSession, content: ExtensionDocumentContent, search_space_id: int ) -> Optional[Document]: """ Process and store document content received from the SurfSense Extension. 
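The deduplication introduced throughout background_tasks.py in the hunks above and below keys on generate_content_hash, which the diff imports from app.utils.document_converters; its implementation is not shown in this excerpt. A minimal sketch of such a helper, assuming a SHA-256 hex digest over the assembled document string (the real implementation may differ):

import hashlib


def generate_content_hash(content: str) -> str:
    """Return a stable hex digest for a document's combined content string."""
    # Encode as UTF-8 so identical text always produces the same hash,
    # regardless of which ingestion path assembled it.
    return hashlib.sha256(content.encode("utf-8")).hexdigest()

Because the hash is computed over the fully assembled document string, re-submitting identical content yields the same hash and the existing row is returned instead of a duplicate being created.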
@@ -129,20 +147,21 @@ async def add_extension_received_document( try: # Format document metadata in a more maintainable way metadata_sections = [ - ("METADATA", [ - f"SESSION_ID: {content.metadata.BrowsingSessionId}", - f"URL: {content.metadata.VisitedWebPageURL}", - f"TITLE: {content.metadata.VisitedWebPageTitle}", - f"REFERRER: {content.metadata.VisitedWebPageReffererURL}", - f"TIMESTAMP: {content.metadata.VisitedWebPageDateWithTimeInISOString}", - f"DURATION_MS: {content.metadata.VisitedWebPageVisitDurationInMilliseconds}" - ]), - ("CONTENT", [ - "FORMAT: markdown", - "TEXT_START", - content.pageContent, - "TEXT_END" - ]) + ( + "METADATA", + [ + f"SESSION_ID: {content.metadata.BrowsingSessionId}", + f"URL: {content.metadata.VisitedWebPageURL}", + f"TITLE: {content.metadata.VisitedWebPageTitle}", + f"REFERRER: {content.metadata.VisitedWebPageReffererURL}", + f"TIMESTAMP: {content.metadata.VisitedWebPageDateWithTimeInISOString}", + f"DURATION_MS: {content.metadata.VisitedWebPageVisitDurationInMilliseconds}", + ], + ), + ( + "CONTENT", + ["FORMAT: markdown", "TEXT_START", content.pageContent, "TEXT_END"], + ), ] # Build the document string more efficiently @@ -155,18 +174,33 @@ async def add_extension_received_document( document_parts.append(f"") document_parts.append("") - combined_document_string = '\n'.join(document_parts) + combined_document_string = "\n".join(document_parts) + content_hash = generate_content_hash(combined_document_string) + + # Check if document with this content hash already exists + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document = existing_doc_result.scalars().first() + + if existing_document: + logging.info(f"Document with content hash {content_hash} already exists. 
Skipping processing.") + return existing_document # Generate summary summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance - summary_result = await summary_chain.ainvoke({"document": combined_document_string}) + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed( - summary_content) + summary_embedding = config.embedding_model_instance.embed(summary_content) # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) for chunk in config.chunker_instance.chunk(content.pageContent) ] @@ -178,7 +212,8 @@ async def add_extension_received_document( document_metadata=content.metadata.model_dump(), content=summary_content, embedding=summary_embedding, - chunks=chunks + chunks=chunks, + content_hash=content_hash, ) session.add(document) @@ -195,27 +230,34 @@ async def add_extension_received_document( raise RuntimeError(f"Failed to process extension document: {str(e)}") -async def add_received_file_document( - session: AsyncSession, - file_name: str, - unstructured_processed_elements: List[LangChainDocument], - search_space_id: int +async def add_received_markdown_file_document( + session: AsyncSession, file_name: str, file_in_markdown: str, search_space_id: int ) -> Optional[Document]: try: - file_in_markdown = await convert_document_to_markdown(unstructured_processed_elements) + content_hash = generate_content_hash(file_in_markdown) - # TODO: Check if file_markdown exceeds token limit of embedding model + # Check if document with this content hash already exists + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document = existing_doc_result.scalars().first() + + if existing_document: + logging.info(f"Document with content hash {content_hash} already exists. 
Skipping processing.") + return existing_document # Generate summary summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance summary_result = await summary_chain.ainvoke({"document": file_in_markdown}) summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed( - summary_content) + summary_embedding = config.embedding_model_instance.embed(summary_content) - # Process chunks + # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) for chunk in config.chunker_instance.chunk(file_in_markdown) ] @@ -226,11 +268,11 @@ async def add_received_file_document( document_type=DocumentType.FILE, document_metadata={ "FILE_NAME": file_name, - "SAVED_AT": datetime.now().strftime("%Y-%m-%d %H:%M:%S") }, content=summary_content, embedding=summary_embedding, - chunks=chunks + chunks=chunks, + content_hash=content_hash, ) session.add(document) @@ -246,24 +288,176 @@ async def add_received_file_document( raise RuntimeError(f"Failed to process file document: {str(e)}") -async def add_youtube_video_document( +async def add_received_file_document_using_unstructured( session: AsyncSession, - url: str, - search_space_id: int -): + file_name: str, + unstructured_processed_elements: List[LangChainDocument], + search_space_id: int, +) -> Optional[Document]: + try: + file_in_markdown = await convert_document_to_markdown( + unstructured_processed_elements + ) + + content_hash = generate_content_hash(file_in_markdown) + + # Check if document with this content hash already exists + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document = existing_doc_result.scalars().first() + + if existing_document: + logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.") + return existing_document + + # TODO: Check if file_markdown exceeds token limit of embedding model + + # Generate summary + summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance + summary_result = await summary_chain.ainvoke({"document": file_in_markdown}) + summary_content = summary_result.content + summary_embedding = config.embedding_model_instance.embed(summary_content) + + # Process chunks + chunks = [ + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) + for chunk in config.chunker_instance.chunk(file_in_markdown) + ] + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file_name, + "ETL_SERVICE": "UNSTRUCTURED", + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError(f"Failed to process file document: {str(e)}") + + +async def add_received_file_document_using_llamacloud( + session: AsyncSession, + file_name: str, + llamacloud_markdown_document: str, + search_space_id: int, +) -> Optional[Document]: """ - Process a YouTube video URL, extract transcripts, and add as document. + Process and store document content parsed by LlamaCloud. 
+ + Args: + session: Database session + file_name: Name of the processed file + llamacloud_markdown_documents: List of markdown content from LlamaCloud parsing + search_space_id: ID of the search space + + Returns: + Document object if successful, None if failed """ try: - from youtube_transcript_api import YouTubeTranscriptApi + # Combine all markdown documents into one + file_in_markdown = llamacloud_markdown_document + + content_hash = generate_content_hash(file_in_markdown) + + # Check if document with this content hash already exists + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document = existing_doc_result.scalars().first() + if existing_document: + logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.") + return existing_document + + # Generate summary + summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance + summary_result = await summary_chain.ainvoke({"document": file_in_markdown}) + summary_content = summary_result.content + summary_embedding = config.embedding_model_instance.embed(summary_content) + + # Process chunks + chunks = [ + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) + for chunk in config.chunker_instance.chunk(file_in_markdown) + ] + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file_name, + "ETL_SERVICE": "LLAMACLOUD", + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}") + + +async def add_youtube_video_document( + session: AsyncSession, url: str, search_space_id: int +): + """ + Process a YouTube video URL, extract transcripts, and store as a document. 
+ + Args: + session: Database session for storing the document + url: YouTube video URL (supports standard, shortened, and embed formats) + search_space_id: ID of the search space to add the document to + + Returns: + Document: The created document object + + Raises: + ValueError: If the YouTube video ID cannot be extracted from the URL + SQLAlchemyError: If there's a database error + RuntimeError: If the video processing fails + """ + try: # Extract video ID from URL def get_youtube_video_id(url: str): - from urllib.parse import urlparse, parse_qs - parsed_url = urlparse(url) hostname = parsed_url.hostname - + if hostname == "youtu.be": return parsed_url.path[1:] if hostname in ("www.youtube.com", "youtube.com"): @@ -275,26 +469,23 @@ async def add_youtube_video_document( if parsed_url.path.startswith("/v/"): return parsed_url.path.split("/")[2] return None - + # Get video ID video_id = get_youtube_video_id(url) if not video_id: raise ValueError(f"Could not extract video ID from URL: {url}") - - # Get video metadata - import json - from urllib.parse import urlencode - from urllib.request import urlopen - - params = {"format": "json", "url": f"https://www.youtube.com/watch?v={video_id}"} + + # Get video metadata using async HTTP client + params = { + "format": "json", + "url": f"https://www.youtube.com/watch?v={video_id}", + } oembed_url = "https://www.youtube.com/oembed" - query_string = urlencode(params) - full_url = oembed_url + "?" + query_string - - with urlopen(full_url) as response: - response_text = response.read() - video_data = json.loads(response_text.decode()) - + + async with aiohttp.ClientSession() as http_session: + async with http_session.get(oembed_url, params=params) as response: + video_data = await response.json() + # Get video transcript try: captions = YouTubeTranscriptApi.get_transcript(video_id) @@ -309,22 +500,23 @@ async def add_youtube_video_document( transcript_text = "\n".join(transcript_segments) except Exception as e: transcript_text = f"No captions available for this video. Error: {str(e)}" - + # Format document metadata in a more maintainable way metadata_sections = [ - ("METADATA", [ - f"TITLE: {video_data.get('title', 'YouTube Video')}", - f"URL: {url}", - f"VIDEO_ID: {video_id}", - f"AUTHOR: {video_data.get('author_name', 'Unknown')}", - f"THUMBNAIL: {video_data.get('thumbnail_url', '')}" - ]), - ("CONTENT", [ - "FORMAT: transcript", - "TEXT_START", - transcript_text, - "TEXT_END" - ]) + ( + "METADATA", + [ + f"TITLE: {video_data.get('title', 'YouTube Video')}", + f"URL: {url}", + f"VIDEO_ID: {video_id}", + f"AUTHOR: {video_data.get('author_name', 'Unknown')}", + f"THUMBNAIL: {video_data.get('thumbnail_url', '')}", + ], + ), + ( + "CONTENT", + ["FORMAT: transcript", "TEXT_START", transcript_text, "TEXT_END"], + ), ] # Build the document string more efficiently @@ -337,23 +529,38 @@ async def add_youtube_video_document( document_parts.append(f"") document_parts.append("") - combined_document_string = '\n'.join(document_parts) + combined_document_string = "\n".join(document_parts) + content_hash = generate_content_hash(combined_document_string) + + # Check if document with this content hash already exists + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document = existing_doc_result.scalars().first() + + if existing_document: + logging.info(f"Document with content hash {content_hash} already exists. 
Skipping processing.") + return existing_document # Generate summary summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance - summary_result = await summary_chain.ainvoke({"document": combined_document_string}) + summary_result = await summary_chain.ainvoke( + {"document": combined_document_string} + ) summary_content = summary_result.content summary_embedding = config.embedding_model_instance.embed(summary_content) # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) - for chunk in config.chunker_instance.chunk(transcript_text) + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) + for chunk in config.chunker_instance.chunk(combined_document_string) ] - + # Create document - from app.db import Document, DocumentType - + document = Document( title=video_data.get("title", "YouTube Video"), document_type=DocumentType.YOUTUBE_VIDEO, @@ -362,24 +569,24 @@ async def add_youtube_video_document( "video_id": video_id, "video_title": video_data.get("title", "YouTube Video"), "author": video_data.get("author_name", "Unknown"), - "thumbnail": video_data.get("thumbnail_url", "") + "thumbnail": video_data.get("thumbnail_url", ""), }, content=summary_content, embedding=summary_embedding, chunks=chunks, - search_space_id=search_space_id + search_space_id=search_space_id, + content_hash=content_hash, ) - + session.add(document) await session.commit() await session.refresh(document) - + return document except SQLAlchemyError as db_error: await session.rollback() raise db_error except Exception as e: await session.rollback() - import logging logging.error(f"Failed to process YouTube video: {str(e)}") raise diff --git a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py index 7c210628d..0e0964eeb 100644 --- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py +++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py @@ -14,6 +14,8 @@ from app.connectors.linear_connector import LinearConnector from slack_sdk.errors import SlackApiError import logging +from app.utils.document_converters import generate_content_hash + # Set up logging logger = logging.getLogger(__name__) @@ -67,13 +69,13 @@ async def index_slack_messages( # Check if last_indexed_at is in the future or after end_date if last_indexed_naive > end_date: - logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 30 days ago instead.") - start_date = end_date - timedelta(days=30) + logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. 
Using 365 days ago instead.") + start_date = end_date - timedelta(days=365) else: start_date = last_indexed_naive logger.info(f"Using last_indexed_at ({start_date.strftime('%Y-%m-%d')}) as start date") else: - start_date = end_date - timedelta(days=30) # Use 30 days instead of 365 to catch recent issues + start_date = end_date - timedelta(days=365) # Use 365 days as default logger.info(f"No last_indexed_at found, using {start_date.strftime('%Y-%m-%d')} (30 days ago) as start date") # Format dates for Slack API @@ -89,58 +91,31 @@ async def index_slack_messages( if not channels: return 0, "No Slack channels found" - # Get existing documents for this search space and connector type to prevent duplicates - existing_docs_result = await session.execute( - select(Document) - .filter( - Document.search_space_id == search_space_id, - Document.document_type == DocumentType.SLACK_CONNECTOR - ) - ) - existing_docs = existing_docs_result.scalars().all() - - # Create a lookup dictionary of existing documents by channel_id - existing_docs_by_channel_id = {} - for doc in existing_docs: - if "channel_id" in doc.document_metadata: - existing_docs_by_channel_id[doc.document_metadata["channel_id"]] = doc - - logger.info(f"Found {len(existing_docs_by_channel_id)} existing Slack documents in database") - # Track the number of documents indexed documents_indexed = 0 - documents_updated = 0 documents_skipped = 0 skipped_channels = [] # Process each channel - for channel_name, channel_id in channels.items(): + for channel_obj in channels: # Modified loop to iterate over list of channel objects + channel_id = channel_obj["id"] + channel_name = channel_obj["name"] + is_private = channel_obj["is_private"] + is_member = channel_obj["is_member"] # This might be False for public channels too + try: - # Check if the bot is a member of the channel - try: - # First try to get channel info to check if bot is a member - channel_info = slack_client.client.conversations_info(channel=channel_id) - - # For private channels, the bot needs to be a member - if channel_info.get("channel", {}).get("is_private", False): - # Check if bot is a member - is_member = channel_info.get("channel", {}).get("is_member", False) - if not is_member: - logger.warning(f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping.") - skipped_channels.append(f"{channel_name} (private, bot not a member)") - documents_skipped += 1 - continue - except SlackApiError as e: - if "not_in_channel" in str(e) or "channel_not_found" in str(e): - logger.warning(f"Bot cannot access channel {channel_name} ({channel_id}). Skipping.") - skipped_channels.append(f"{channel_name} (access error)") - documents_skipped += 1 - continue - else: - # Re-raise if it's a different error - raise + # If it's a private channel and the bot is not a member, skip. + # For public channels, if they are listed by conversations.list, the bot can typically read history. + # The `not_in_channel` error in get_conversation_history will be the ultimate gatekeeper if history is inaccessible. + if is_private and not is_member: + logger.warning(f"Bot is not a member of private channel {channel_name} ({channel_id}). Skipping.") + skipped_channels.append(f"{channel_name} (private, bot not a member)") + documents_skipped += 1 + continue # Get messages for this channel + # The get_history_by_date_range now uses get_conversation_history, + # which handles 'not_in_channel' by returning [] and logging. 
messages, error = slack_client.get_history_by_date_range( channel_id=channel_id, start_date=start_date_str, @@ -189,10 +164,9 @@ async def index_slack_messages( ("METADATA", [ f"CHANNEL_NAME: {channel_name}", f"CHANNEL_ID: {channel_id}", - f"START_DATE: {start_date_str}", - f"END_DATE: {end_date_str}", - f"MESSAGE_COUNT: {len(formatted_messages)}", - f"INDEXED_AT: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" + # f"START_DATE: {start_date_str}", + # f"END_DATE: {end_date_str}", + f"MESSAGE_COUNT: {len(formatted_messages)}" ]), ("CONTENT", [ "FORMAT: markdown", @@ -213,6 +187,18 @@ async def index_slack_messages( document_parts.append("") combined_document_string = '\n'.join(document_parts) + content_hash = generate_content_hash(combined_document_string) + + # Check if document with this content hash already exists + existing_doc_by_hash_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document_by_hash = existing_doc_by_hash_result.scalars().first() + + if existing_document_by_hash: + logger.info(f"Document with content hash {content_hash} already exists for channel {channel_name}. Skipping processing.") + documents_skipped += 1 + continue # Generate summary summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance @@ -222,65 +208,32 @@ async def index_slack_messages( # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(channel_content) ] - # Check if this channel already exists in our database - existing_document = existing_docs_by_channel_id.get(channel_id) - - if existing_document: - # Update existing document instead of creating a new one - logger.info(f"Updating existing document for channel {channel_name}") - - # Update document fields - existing_document.title = f"Slack - {channel_name}" - existing_document.document_metadata = { + # Create and store new document + document = Document( + search_space_id=search_space_id, + title=f"Slack - {channel_name}", + document_type=DocumentType.SLACK_CONNECTOR, + document_metadata={ "channel_name": channel_name, "channel_id": channel_id, "start_date": start_date_str, "end_date": end_date_str, "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - existing_document.content = summary_content - existing_document.embedding = summary_embedding - - # Delete existing chunks and add new ones - await session.execute( - delete(Chunk) - .where(Chunk.document_id == existing_document.id) - ) - - # Assign new chunks to existing document - for chunk in chunks: - chunk.document_id = existing_document.id - session.add(chunk) - - documents_updated += 1 - else: - # Create and store new document - document = Document( - search_space_id=search_space_id, - title=f"Slack - {channel_name}", - document_type=DocumentType.SLACK_CONNECTOR, - document_metadata={ - "channel_name": channel_name, - "channel_id": channel_id, - "start_date": start_date_str, - "end_date": end_date_str, - "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks - ) - - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages") + 
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + documents_indexed += 1 + logger.info(f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages") except SlackApiError as slack_error: logger.error(f"Slack API error for channel {channel_name}: {str(slack_error)}") @@ -295,7 +248,7 @@ async def index_slack_messages( # Update the last_indexed_at timestamp for the connector only if requested # and if we successfully indexed at least one channel - total_processed = documents_indexed + documents_updated + total_processed = documents_indexed if update_last_indexed and total_processed > 0: connector.last_indexed_at = datetime.now() @@ -305,11 +258,11 @@ async def index_slack_messages( # Prepare result message result_message = None if skipped_channels: - result_message = f"Processed {total_processed} channels ({documents_indexed} new, {documents_updated} updated). Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" + result_message = f"Processed {total_processed} channels. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" else: - result_message = f"Processed {total_processed} channels ({documents_indexed} new, {documents_updated} updated)." + result_message = f"Processed {total_processed} channels." - logger.info(f"Slack indexing completed: {documents_indexed} new channels, {documents_updated} updated, {documents_skipped} skipped") + logger.info(f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped") return total_processed, result_message except SQLAlchemyError as db_error: @@ -386,27 +339,8 @@ async def index_notion_pages( logger.info("No Notion pages found to index") return 0, "No Notion pages found" - # Get existing documents for this search space and connector type to prevent duplicates - existing_docs_result = await session.execute( - select(Document) - .filter( - Document.search_space_id == search_space_id, - Document.document_type == DocumentType.NOTION_CONNECTOR - ) - ) - existing_docs = existing_docs_result.scalars().all() - - # Create a lookup dictionary of existing documents by page_id - existing_docs_by_page_id = {} - for doc in existing_docs: - if "page_id" in doc.document_metadata: - existing_docs_by_page_id[doc.document_metadata["page_id"]] = doc - - logger.info(f"Found {len(existing_docs_by_page_id)} existing Notion documents in database") - # Track the number of documents indexed documents_indexed = 0 - documents_updated = 0 documents_skipped = 0 skipped_pages = [] @@ -482,8 +416,7 @@ async def index_notion_pages( metadata_sections = [ ("METADATA", [ f"PAGE_TITLE: {page_title}", - f"PAGE_ID: {page_id}", - f"INDEXED_AT: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" + f"PAGE_ID: {page_id}" ]), ("CONTENT", [ "FORMAT: markdown", @@ -504,6 +437,18 @@ async def index_notion_pages( document_parts.append("") combined_document_string = '\n'.join(document_parts) + content_hash = generate_content_hash(combined_document_string) + + # Check if document with this content hash already exists + existing_doc_by_hash_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document_by_hash = existing_doc_by_hash_result.scalars().first() + + if existing_document_by_hash: + logger.info(f"Document with content hash {content_hash} already exists for page {page_title}. 
Skipping processing.") + documents_skipped += 1 + continue # Generate summary logger.debug(f"Generating summary for page {page_title}") @@ -515,59 +460,29 @@ async def index_notion_pages( # Process chunks logger.debug(f"Chunking content for page {page_title}") chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(markdown_content) ] - # Check if this page already exists in our database - existing_document = existing_docs_by_page_id.get(page_id) - - if existing_document: - # Update existing document instead of creating a new one - logger.info(f"Updating existing document for page {page_title}") - - # Update document fields - existing_document.title = f"Notion - {page_title}" - existing_document.document_metadata = { + # Create and store new document + document = Document( + search_space_id=search_space_id, + title=f"Notion - {page_title}", + document_type=DocumentType.NOTION_CONNECTOR, + document_metadata={ "page_title": page_title, "page_id": page_id, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - existing_document.content = summary_content - existing_document.embedding = summary_embedding - - # Delete existing chunks and add new ones - await session.execute( - delete(Chunk) - .where(Chunk.document_id == existing_document.id) - ) - - # Assign new chunks to existing document - for chunk in chunks: - chunk.document_id = existing_document.id - session.add(chunk) - - documents_updated += 1 - else: - # Create and store new document - document = Document( - search_space_id=search_space_id, - title=f"Notion - {page_title}", - document_type=DocumentType.NOTION_CONNECTOR, - document_metadata={ - "page_title": page_title, - "page_id": page_id, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks - ) - - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new Notion page: {page_title}") + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + }, + content=summary_content, + content_hash=content_hash, + embedding=summary_embedding, + chunks=chunks + ) + + session.add(document) + documents_indexed += 1 + logger.info(f"Successfully indexed new Notion page: {page_title}") except Exception as e: logger.error(f"Error processing Notion page {page.get('title', 'Unknown')}: {str(e)}", exc_info=True) @@ -577,7 +492,7 @@ async def index_notion_pages( # Update the last_indexed_at timestamp for the connector only if requested # and if we successfully indexed at least one page - total_processed = documents_indexed + documents_updated + total_processed = documents_indexed if update_last_indexed and total_processed > 0: connector.last_indexed_at = datetime.now() logger.info(f"Updated last_indexed_at for connector {connector_id}") @@ -588,11 +503,11 @@ async def index_notion_pages( # Prepare result message result_message = None if skipped_pages: - result_message = f"Processed {total_processed} pages ({documents_indexed} new, {documents_updated} updated). Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}" + result_message = f"Processed {total_processed} pages. Skipped {len(skipped_pages)} pages: {', '.join(skipped_pages)}" else: - result_message = f"Processed {total_processed} pages ({documents_indexed} new, {documents_updated} updated)." 
+ result_message = f"Processed {total_processed} pages." - logger.info(f"Notion indexing completed: {documents_indexed} new pages, {documents_updated} updated, {documents_skipped} skipped") + logger.info(f"Notion indexing completed: {documents_indexed} new pages, {documents_skipped} skipped") return total_processed, result_message except SQLAlchemyError as db_error: @@ -660,19 +575,6 @@ async def index_github_repos( # If a repo is inaccessible, get_repository_files will likely fail gracefully later. logger.info(f"Starting indexing for {len(repo_full_names_to_index)} selected repositories.") - # 5. Get existing documents for this search space and connector type to prevent duplicates - existing_docs_result = await session.execute( - select(Document) - .filter( - Document.search_space_id == search_space_id, - Document.document_type == DocumentType.GITHUB_CONNECTOR - ) - ) - existing_docs = existing_docs_result.scalars().all() - # Create a lookup dict: key=repo_fullname/file_path, value=Document object - existing_docs_lookup = {doc.document_metadata.get("full_path"): doc for doc in existing_docs if doc.document_metadata.get("full_path")} - logger.info(f"Found {len(existing_docs_lookup)} existing GitHub documents in database for search space {search_space_id}") - # 6. Iterate through selected repositories and index files for repo_full_name in repo_full_names_to_index: if not repo_full_name or not isinstance(repo_full_name, str): @@ -699,12 +601,6 @@ async def index_github_repos( logger.warning(f"Skipping file with missing info in {repo_full_name}: {file_info}") continue - # Check if document already exists and if content hash matches - existing_doc = existing_docs_lookup.get(full_path_key) - if existing_doc and existing_doc.document_metadata.get("sha") == file_sha: - logger.debug(f"Skipping unchanged file: {full_path_key}") - continue # Skip if SHA matches (content hasn't changed) - # Get file content file_content = github_client.get_file_content(repo_full_name, file_path) @@ -712,6 +608,18 @@ async def index_github_repos( logger.warning(f"Could not retrieve content for {full_path_key}. Skipping.") continue # Skip if content fetch failed + content_hash = generate_content_hash(file_content) + + # Check if document with this content hash already exists + existing_doc_by_hash_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document_by_hash = existing_doc_by_hash_result.scalars().first() + + if existing_document_by_hash: + logger.info(f"Document with content hash {content_hash} already exists for file {full_path_key}. Skipping processing.") + continue + # Use file_content directly for chunking, maybe summary for main content? # For now, let's use the full content for both, might need refinement summary_content = f"GitHub file: {full_path_key}\n\n{file_content[:1000]}..." 
# Simple summary @@ -720,8 +628,8 @@ async def index_github_repos( # Chunk the content try: chunks_data = [ - Chunk(content=chunk.text, embedding=chunk.embedding) - for chunk in config.chunker_instance.chunk(file_content) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + for chunk in config.code_chunker_instance.chunk(file_content) ] except Exception as chunk_err: logger.error(f"Failed to chunk file {full_path_key}: {chunk_err}") @@ -738,42 +646,20 @@ async def index_github_repos( "indexed_at": datetime.now(timezone.utc).isoformat() } - if existing_doc: - # Update existing document - logger.info(f"Updating document for file: {full_path_key}") - existing_doc.title = f"GitHub - {file_path}" - existing_doc.document_metadata = doc_metadata - existing_doc.content = summary_content # Update summary - existing_doc.embedding = summary_embedding # Update embedding - - # Delete old chunks - await session.execute( - delete(Chunk) - .where(Chunk.document_id == existing_doc.id) - ) - # Add new chunks - for chunk_obj in chunks_data: - chunk_obj.document_id = existing_doc.id - session.add(chunk_obj) - - documents_processed += 1 - else: - # Create new document - logger.info(f"Creating new document for file: {full_path_key}") - document = Document( - title=f"GitHub - {file_path}", - document_type=DocumentType.GITHUB_CONNECTOR, - document_metadata=doc_metadata, - content=summary_content, # Store summary - embedding=summary_embedding, - search_space_id=search_space_id, - chunks=chunks_data # Associate chunks directly - ) - session.add(document) - documents_processed += 1 - - # Commit periodically or at the end? For now, commit per repo - # await session.commit() + # Create new document + logger.info(f"Creating new document for file: {full_path_key}") + document = Document( + title=f"GitHub - {file_path}", + document_type=DocumentType.GITHUB_CONNECTOR, + document_metadata=doc_metadata, + content=summary_content, # Store summary + content_hash=content_hash, + embedding=summary_embedding, + search_space_id=search_space_id, + chunks=chunks_data # Associate chunks directly + ) + session.add(document) + documents_processed += 1 except Exception as repo_err: logger.error(f"Failed to process repository {repo_full_name}: {repo_err}") @@ -847,14 +733,14 @@ async def index_linear_issues( # Check if last_indexed_at is in the future or after end_date if last_indexed_naive > end_date: - logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. Using 30 days ago instead.") - start_date = end_date - timedelta(days=30) + logger.warning(f"Last indexed date ({last_indexed_naive.strftime('%Y-%m-%d')}) is in the future. 
Using 365 days ago instead.") + start_date = end_date - timedelta(days=365) else: start_date = last_indexed_naive logger.info(f"Using last_indexed_at ({start_date.strftime('%Y-%m-%d')}) as start date") else: - start_date = end_date - timedelta(days=30) # Use 30 days instead of 365 to catch recent issues - logger.info(f"No last_indexed_at found, using {start_date.strftime('%Y-%m-%d')} (30 days ago) as start date") + start_date = end_date - timedelta(days=365) # Use 365 days as default + logger.info(f"No last_indexed_at found, using {start_date.strftime('%Y-%m-%d')} (365 days ago) as start date") # Format dates for Linear API start_date_str = start_date.strftime("%Y-%m-%d") @@ -905,35 +791,8 @@ async def index_linear_issues( if len(issues) > 10: logger.info(f" ...and {len(issues) - 10} more issues") - # Get existing documents for this search space and connector type to prevent duplicates - existing_docs_result = await session.execute( - select(Document) - .filter( - Document.search_space_id == search_space_id, - Document.document_type == DocumentType.LINEAR_CONNECTOR - ) - ) - existing_docs = existing_docs_result.scalars().all() - - # Create a lookup dictionary of existing documents by issue_id - existing_docs_by_issue_id = {} - for doc in existing_docs: - if "issue_id" in doc.document_metadata: - existing_docs_by_issue_id[doc.document_metadata["issue_id"]] = doc - - logger.info(f"Found {len(existing_docs_by_issue_id)} existing Linear documents in database") - - # Log existing document IDs for debugging - if existing_docs_by_issue_id: - logger.info("Existing Linear document issue IDs in database:") - for idx, (issue_id, doc) in enumerate(list(existing_docs_by_issue_id.items())[:10]): # Log first 10 - logger.info(f" {idx+1}. {issue_id} - {doc.document_metadata.get('issue_identifier', 'Unknown')} - {doc.document_metadata.get('issue_title', 'Unknown')}") - if len(existing_docs_by_issue_id) > 10: - logger.info(f" ...and {len(existing_docs_by_issue_id) - 10} more existing documents") - # Track the number of documents indexed documents_indexed = 0 - documents_updated = 0 documents_skipped = 0 skipped_issues = [] @@ -979,71 +838,51 @@ async def index_linear_issues( comment_count = len(formatted_issue.get("comments", [])) summary_content += f"Comments: {comment_count}" + content_hash = generate_content_hash(issue_content) + + # Check if document with this content hash already exists + existing_doc_by_hash_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document_by_hash = existing_doc_by_hash_result.scalars().first() + + if existing_document_by_hash: + logger.info(f"Document with content hash {content_hash} already exists for issue {issue_identifier}. 
Skipping processing.") + documents_skipped += 1 + continue + # Generate embedding for the summary summary_embedding = config.embedding_model_instance.embed(summary_content) # Process chunks - using the full issue content with comments chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(issue_content) ] - # Check if this issue already exists in our database - existing_document = existing_docs_by_issue_id.get(issue_id) - - if existing_document: - # Update existing document instead of creating a new one - logger.info(f"Updating existing document for issue {issue_identifier} - {issue_title}") - - # Update document fields - existing_document.title = f"Linear - {issue_identifier}: {issue_title}" - existing_document.document_metadata = { + # Create and store new document + logger.info(f"Creating new document for issue {issue_identifier} - {issue_title}") + document = Document( + search_space_id=search_space_id, + title=f"Linear - {issue_identifier}: {issue_title}", + document_type=DocumentType.LINEAR_CONNECTOR, + document_metadata={ "issue_id": issue_id, "issue_identifier": issue_identifier, "issue_title": issue_title, "state": state, "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - existing_document.content = summary_content - existing_document.embedding = summary_embedding - - # Delete existing chunks and add new ones - await session.execute( - delete(Chunk) - .where(Chunk.document_id == existing_document.id) - ) - - # Assign new chunks to existing document - for chunk in chunks: - chunk.document_id = existing_document.id - session.add(chunk) - - documents_updated += 1 - else: - # Create and store new document - logger.info(f"Creating new document for issue {issue_identifier} - {issue_title}") - document = Document( - search_space_id=search_space_id, - title=f"Linear - {issue_identifier}: {issue_title}", - document_type=DocumentType.LINEAR_CONNECTOR, - document_metadata={ - "issue_id": issue_id, - "issue_identifier": issue_identifier, - "issue_title": issue_title, - "state": state, - "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks - ) - - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new issue {issue_identifier} - {issue_title}") + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + }, + content=summary_content, + content_hash=content_hash, + embedding=summary_embedding, + chunks=chunks + ) + + session.add(document) + documents_indexed += 1 + logger.info(f"Successfully indexed new issue {issue_identifier} - {issue_title}") except Exception as e: logger.error(f"Error processing issue {issue.get('identifier', 'Unknown')}: {str(e)}", exc_info=True) @@ -1052,7 +891,7 @@ async def index_linear_issues( continue # Skip this issue and continue with others # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed + documents_updated + total_processed = documents_indexed if update_last_indexed: connector.last_indexed_at = datetime.now() logger.info(f"Updated last_indexed_at to {connector.last_indexed_at}") @@ -1062,7 +901,7 @@ async def index_linear_issues( logger.info(f"Successfully committed all Linear document changes to 
database") - logger.info(f"Linear indexing completed: {documents_indexed} new issues, {documents_updated} updated, {documents_skipped} skipped") + logger.info(f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped") return total_processed, None # Return None as the error message to indicate success except SQLAlchemyError as db_error: diff --git a/surfsense_backend/app/tasks/podcast_tasks.py b/surfsense_backend/app/tasks/podcast_tasks.py new file mode 100644 index 000000000..12364e7ce --- /dev/null +++ b/surfsense_backend/app/tasks/podcast_tasks.py @@ -0,0 +1,93 @@ + +from app.agents.podcaster.graph import graph as podcaster_graph +from app.agents.podcaster.state import State +from app.db import Chat, Podcast +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + + +async def generate_document_podcast( + session: AsyncSession, + document_id: int, + search_space_id: int, + user_id: int +): + # TODO: Need to fetch the document chunks, then concatenate them and pass them to the podcast generation model + pass + + + +async def generate_chat_podcast( + session: AsyncSession, + chat_id: int, + search_space_id: int, + podcast_title: str +): + # Fetch the chat with the specified ID + query = select(Chat).filter( + Chat.id == chat_id, + Chat.search_space_id == search_space_id + ) + + result = await session.execute(query) + chat = result.scalars().first() + + if not chat: + raise ValueError(f"Chat with id {chat_id} not found in search space {search_space_id}") + + # Create chat history structure + chat_history_str = "" + + for message in chat.messages: + if message["role"] == "user": + chat_history_str += f"{message['content']}" + elif message["role"] == "assistant": + # Last annotation type will always be "ANSWER" here + answer_annotation = message["annotations"][-1] + answer_text = "" + if answer_annotation["type"] == "ANSWER": + answer_text = answer_annotation["content"] + # If content is a list, join it into a single string + if isinstance(answer_text, list): + answer_text = "\n".join(answer_text) + chat_history_str += f"{answer_text}" + + chat_history_str += "" + + # Pass it to the SurfSense Podcaster + config = { + "configurable": { + "podcast_title" : "Surfsense", + } + } + # Initialize state with database session and streaming service + initial_state = State( + source_content=chat_history_str, + ) + + # Run the graph directly + result = await podcaster_graph.ainvoke(initial_state, config=config) + + # Convert podcast transcript entries to serializable format + serializable_transcript = [] + for entry in result["podcast_transcript"]: + serializable_transcript.append({ + "speaker_id": entry.speaker_id, + "dialog": entry.dialog + }) + + # Create a new podcast entry + podcast = Podcast( + title=f"{podcast_title}", + podcast_transcript=serializable_transcript, + file_location=result["final_podcast_file_path"], + search_space_id=search_space_id + ) + + # Add to session and commit + session.add(podcast) + await session.commit() + await session.refresh(podcast) + + return podcast + diff --git a/surfsense_backend/app/tasks/stream_connector_search_results.py b/surfsense_backend/app/tasks/stream_connector_search_results.py index c7eb07627..aa5f40179 100644 --- a/surfsense_backend/app/tasks/stream_connector_search_results.py +++ b/surfsense_backend/app/tasks/stream_connector_search_results.py @@ -1,4 +1,4 @@ -from typing import AsyncGenerator, List, Union +from typing import Any, AsyncGenerator, List, Union from uuid import UUID from 
app.agents.researcher.graph import graph as researcher_graph @@ -6,6 +6,8 @@ from app.agents.researcher.state import State from app.utils.streaming_service import StreamingService from sqlalchemy.ext.asyncio import AsyncSession +from app.agents.researcher.configuration import SearchMode + async def stream_connector_search_results( user_query: str, @@ -13,7 +15,9 @@ async def stream_connector_search_results( search_space_id: int, session: AsyncSession, research_mode: str, - selected_connectors: List[str] + selected_connectors: List[str], + langchain_chat_history: List[Any], + search_mode_str: str ) -> AsyncGenerator[str, None]: """ Stream connector search results to the client @@ -40,6 +44,11 @@ async def stream_connector_search_results( # Convert UUID to string if needed user_id_str = str(user_id) if isinstance(user_id, UUID) else user_id + if search_mode_str == "CHUNKS": + search_mode = SearchMode.CHUNKS + elif search_mode_str == "DOCUMENTS": + search_mode = SearchMode.DOCUMENTS + # Sample configuration config = { "configurable": { @@ -47,13 +56,15 @@ async def stream_connector_search_results( "num_sections": NUM_SECTIONS, "connectors_to_search": selected_connectors, "user_id": user_id_str, - "search_space_id": search_space_id + "search_space_id": search_space_id, + "search_mode": search_mode } } # Initialize state with database session and streaming service initial_state = State( db_session=session, - streaming_service=streaming_service + streaming_service=streaming_service, + chat_history=langchain_chat_history ) # Run the graph directly diff --git a/surfsense_backend/app/users.py b/surfsense_backend/app/users.py index ec4445c3f..d73baae4c 100644 --- a/surfsense_backend/app/users.py +++ b/surfsense_backend/app/users.py @@ -10,8 +10,8 @@ from fastapi_users.authentication import ( JWTStrategy, ) from fastapi_users.db import SQLAlchemyUserDatabase -from httpx_oauth.clients.google import GoogleOAuth2 - +from fastapi.responses import JSONResponse +from fastapi_users.schemas import model_dump from app.config import config from app.db import User, get_user_db from pydantic import BaseModel @@ -22,10 +22,13 @@ class BearerResponse(BaseModel): SECRET = config.SECRET_KEY -google_oauth_client = GoogleOAuth2( - config.GOOGLE_OAUTH_CLIENT_ID, - config.GOOGLE_OAUTH_CLIENT_SECRET, -) +if config.AUTH_TYPE == "GOOGLE": + from httpx_oauth.clients.google import GoogleOAuth2 + + google_oauth_client = GoogleOAuth2( + config.GOOGLE_OAUTH_CLIENT_ID, + config.GOOGLE_OAUTH_CLIENT_SECRET, + ) class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]): @@ -79,7 +82,10 @@ class CustomBearerTransport(BearerTransport): async def get_login_response(self, token: str) -> Response: bearer_response = BearerResponse(access_token=token, token_type="bearer") redirect_url = f"{config.NEXT_FRONTEND_URL}/auth/callback?token={bearer_response.access_token}" - return RedirectResponse(redirect_url, status_code=302) + if config.AUTH_TYPE == "GOOGLE": + return RedirectResponse(redirect_url, status_code=302) + else: + return JSONResponse(model_dump(bearer_response)) bearer_transport = CustomBearerTransport(tokenUrl="auth/jwt/login") diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py index 9a6e13c43..49c3b083a 100644 --- a/surfsense_backend/app/utils/connector_service.py +++ b/surfsense_backend/app/utils/connector_service.py @@ -1,52 +1,80 @@ import json from typing import List, Dict, Any, Optional, Tuple +import asyncio from sqlalchemy.ext.asyncio import AsyncSession 
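For illustration only: a minimal sketch of how the search_mode_str value handled above can be resolved into the SearchMode enum before it is placed into the graph configuration. The resolve_search_mode helper and its fall-back to CHUNKS are assumptions made for this sketch and are not part of the patch.

from app.agents.researcher.configuration import SearchMode

def resolve_search_mode(search_mode_str: str) -> SearchMode:
    # Map the incoming string onto the enum; fall back to chunk-level search
    # when the value is missing or unrecognized (assumed default, not in the patch).
    if search_mode_str == "DOCUMENTS":
        return SearchMode.DOCUMENTS
    return SearchMode.CHUNKS

# The resolved value is what ends up under config["configurable"]["search_mode"].
config = {"configurable": {"search_mode": resolve_search_mode("DOCUMENTS")}}
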
from sqlalchemy.future import select from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever +from app.retriver.documents_hybrid_search import DocumentHybridSearchRetriever from app.db import SearchSourceConnector, SearchSourceConnectorType from tavily import TavilyClient +from linkup import LinkupClient + +from app.agents.researcher.configuration import SearchMode class ConnectorService: def __init__(self, session: AsyncSession): self.session = session - self.retriever = ChucksHybridSearchRetriever(session) + self.chunk_retriever = ChucksHybridSearchRetriever(session) + self.document_retriever = DocumentHybridSearchRetriever(session) self.source_id_counter = 1 + self.counter_lock = asyncio.Lock() # Lock to protect counter in multithreaded environments - async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for crawled URLs and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ - crawled_urls_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="CRAWLED_URL" - ) + if search_mode == SearchMode.CHUNKS: + crawled_urls_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="CRAWLED_URL" + ) + elif search_mode == SearchMode.DOCUMENTS: + crawled_urls_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="CRAWLED_URL" + ) + # Transform document retriever results to match expected format + crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks) + + # Early return if no results + if not crawled_urls_chunks: + return { + "id": 1, + "name": "Crawled URLs", + "type": "CRAWLED_URL", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(crawled_urls_chunks): - # Fix for UI - crawled_urls_chunks[i]['document']['id'] = self.source_id_counter - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(crawled_urls_chunks): + # Fix for UI + crawled_urls_chunks[i]['document']['id'] = self.source_id_counter + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a source entry - source = { - "id": self.source_id_counter, - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') - } + # Create a source entry + source = { + "id": self.source_id_counter, + "title": document.get('title', 'Untitled Document'), + "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), + "url": metadata.get('url', '') + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -58,40 +86,61 @@ class ConnectorService: return 
result_object, crawled_urls_chunks - async def search_files(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_files(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for files and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ - files_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="FILE" - ) + if search_mode == SearchMode.CHUNKS: + files_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="FILE" + ) + elif search_mode == SearchMode.DOCUMENTS: + files_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="FILE" + ) + # Transform document retriever results to match expected format + files_chunks = self._transform_document_results(files_chunks) + + # Early return if no results + if not files_chunks: + return { + "id": 2, + "name": "Files", + "type": "FILE", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(files_chunks): - # Fix for UI - files_chunks[i]['document']['id'] = self.source_id_counter - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(files_chunks): + # Fix for UI + files_chunks[i]['document']['id'] = self.source_id_counter + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a source entry - source = { - "id": self.source_id_counter, - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') - } + # Create a source entry + source = { + "id": self.source_id_counter, + "title": document.get('title', 'Untitled Document'), + "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), + "url": metadata.get('url', '') + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -103,6 +152,31 @@ class ConnectorService: return result_object, files_chunks + def _transform_document_results(self, document_results: List[Dict]) -> List[Dict]: + """ + Transform results from document_retriever.hybrid_search() to match the format + expected by the processing code. 
+ + Args: + document_results: Results from document_retriever.hybrid_search() + + Returns: + List of transformed results in the format expected by the processing code + """ + transformed_results = [] + for doc in document_results: + transformed_results.append({ + 'document': { + 'id': doc.get('document_id'), + 'title': doc.get('title', 'Untitled Document'), + 'document_type': doc.get('document_type'), + 'metadata': doc.get('metadata', {}), + }, + 'content': doc.get('chunks_content', doc.get('content', '')), + 'score': doc.get('score', 0.0) + }) + return transformed_results + async def get_connector_by_type(self, user_id: str, connector_type: SearchSourceConnectorType) -> Optional[SearchSourceConnector]: """ Get a connector by type for a specific user @@ -162,39 +236,49 @@ class ConnectorService: # Extract results from Tavily response tavily_results = response.get("results", []) + # Early return if no results + if not tavily_results: + return { + "id": 3, + "name": "Tavily Search", + "type": "TAVILY_API", + "sources": [], + }, [] + # Process each result and create sources directly without deduplication sources_list = [] documents = [] - for i, result in enumerate(tavily_results): - - # Create a source entry - source = { - "id": self.source_id_counter, - "title": result.get("title", "Tavily Result"), - "description": result.get("content", "")[:100], - "url": result.get("url", "") - } - sources_list.append(source) - - # Create a document entry - document = { - "chunk_id": f"tavily_chunk_{i}", - "content": result.get("content", ""), - "score": result.get("score", 0.0), - "document": { + async with self.counter_lock: + for i, result in enumerate(tavily_results): + + # Create a source entry + source = { "id": self.source_id_counter, "title": result.get("title", "Tavily Result"), - "document_type": "TAVILY_API", - "metadata": { - "url": result.get("url", ""), - "published_date": result.get("published_date", ""), - "source": "TAVILY_API" + "description": result.get("content", "")[:100], + "url": result.get("url", "") + } + sources_list.append(source) + + # Create a document entry + document = { + "chunk_id": f"tavily_chunk_{i}", + "content": result.get("content", ""), + "score": result.get("score", 0.0), + "document": { + "id": self.source_id_counter, + "title": result.get("title", "Tavily Result"), + "document_type": "TAVILY_API", + "metadata": { + "url": result.get("url", ""), + "published_date": result.get("published_date", ""), + "source": "TAVILY_API" + } } } - } - documents.append(document) - self.source_id_counter += 1 + documents.append(document) + self.source_id_counter += 1 # Create result object result_object = { @@ -216,59 +300,80 @@ class ConnectorService: "sources": [], }, [] - async def search_slack(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_slack(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for slack and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ - slack_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="SLACK_CONNECTOR" - ) + if search_mode == SearchMode.CHUNKS: + slack_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="SLACK_CONNECTOR" + ) + 
elif search_mode == SearchMode.DOCUMENTS: + slack_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="SLACK_CONNECTOR" + ) + # Transform document retriever results to match expected format + slack_chunks = self._transform_document_results(slack_chunks) + + # Early return if no results + if not slack_chunks: + return { + "id": 4, + "name": "Slack", + "type": "SLACK_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(slack_chunks): - # Fix for UI - slack_chunks[i]['document']['id'] = self.source_id_counter - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(slack_chunks): + # Fix for UI + slack_chunks[i]['document']['id'] = self.source_id_counter + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a mapped source entry with Slack-specific metadata - channel_name = metadata.get('channel_name', 'Unknown Channel') - channel_id = metadata.get('channel_id', '') - message_date = metadata.get('start_date', '') - - # Create a more descriptive title for Slack messages - title = f"Slack: {channel_name}" - if message_date: - title += f" ({message_date})" + # Create a mapped source entry with Slack-specific metadata + channel_name = metadata.get('channel_name', 'Unknown Channel') + channel_id = metadata.get('channel_id', '') + message_date = metadata.get('start_date', '') - # Create a more descriptive description for Slack messages - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." - - # For URL, we can use a placeholder or construct a URL to the Slack channel if available - url = "" - if channel_id: - url = f"https://slack.com/app_redirect?channel={channel_id}" + # Create a more descriptive title for Slack messages + title = f"Slack: {channel_name}" + if message_date: + title += f" ({message_date})" + + # Create a more descriptive description for Slack messages + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." 
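A stripped-down illustration of the counter_lock pattern used here and in the other search_* methods: the shared source_id_counter is read and advanced only while the asyncio.Lock is held, so connector searches running concurrently cannot hand out the same citation id. This is a standalone sketch, not code from the patch; the class and method names are invented.

import asyncio

class SourceIdAllocator:
    # Minimal stand-in for the id-assignment portion of ConnectorService.
    def __init__(self) -> None:
        self.source_id_counter = 1
        self.counter_lock = asyncio.Lock()

    async def assign_ids(self, chunks: list[dict]) -> list[dict]:
        # Hold the lock while numbering this batch so concurrent searches
        # never reuse an id.
        async with self.counter_lock:
            for chunk in chunks:
                chunk["document"]["id"] = self.source_id_counter
                self.source_id_counter += 1
        return chunks

async def demo() -> None:
    allocator = SourceIdAllocator()
    batches = [[{"document": {}}] for _ in range(3)]
    await asyncio.gather(*(allocator.assign_ids(batch) for batch in batches))

asyncio.run(demo())
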
+ + # For URL, we can use a placeholder or construct a URL to the Slack channel if available + url = "" + if channel_id: + url = f"https://slack.com/app_redirect?channel={channel_id}" - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -280,7 +385,7 @@ class ConnectorService: return result_object, slack_chunks - async def search_notion(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_notion(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for Notion pages and return both the source information and langchain documents @@ -293,54 +398,75 @@ class ConnectorService: Returns: tuple: (sources_info, langchain_documents) """ - notion_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="NOTION_CONNECTOR" - ) + if search_mode == SearchMode.CHUNKS: + notion_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="NOTION_CONNECTOR" + ) + elif search_mode == SearchMode.DOCUMENTS: + notion_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="NOTION_CONNECTOR" + ) + # Transform document retriever results to match expected format + notion_chunks = self._transform_document_results(notion_chunks) + + # Early return if no results + if not notion_chunks: + return { + "id": 5, + "name": "Notion", + "type": "NOTION_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(notion_chunks): - # Fix for UI - notion_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) - - # Create a mapped source entry with Notion-specific metadata - page_title = metadata.get('page_title', 'Untitled Page') - page_id = metadata.get('page_id', '') - indexed_at = metadata.get('indexed_at', '') - - # Create a more descriptive title for Notion pages - title = f"Notion: {page_title}" - if indexed_at: - title += f" (indexed: {indexed_at})" + async with self.counter_lock: + for i, chunk in enumerate(notion_chunks): + # Fix for UI + notion_chunks[i]['document']['id'] = self.source_id_counter - # Create a more descriptive description for Notion pages - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." 
+ # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) + + # Create a mapped source entry with Notion-specific metadata + page_title = metadata.get('page_title', 'Untitled Page') + page_id = metadata.get('page_id', '') + indexed_at = metadata.get('indexed_at', '') - # For URL, we can use a placeholder or construct a URL to the Notion page if available - url = "" - if page_id: - # Notion page URLs follow this format - url = f"https://notion.so/{page_id.replace('-', '')}" + # Create a more descriptive title for Notion pages + title = f"Notion: {page_title}" + if indexed_at: + title += f" (indexed: {indexed_at})" + + # Create a more descriptive description for Notion pages + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." + + # For URL, we can use a placeholder or construct a URL to the Notion page if available + url = "" + if page_id: + # Notion page URLs follow this format + url = f"https://notion.so/{page_id.replace('-', '')}" - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -352,7 +478,7 @@ class ConnectorService: return result_object, notion_chunks - async def search_extension(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_extension(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for extension data and return both the source information and langchain documents @@ -365,72 +491,93 @@ class ConnectorService: Returns: tuple: (sources_info, langchain_documents) """ - extension_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="EXTENSION" - ) + if search_mode == SearchMode.CHUNKS: + extension_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="EXTENSION" + ) + elif search_mode == SearchMode.DOCUMENTS: + extension_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="EXTENSION" + ) + # Transform document retriever results to match expected format + extension_chunks = self._transform_document_results(extension_chunks) + + # Early return if no results + if not extension_chunks: + return { + "id": 6, + "name": "Extension", + "type": "EXTENSION", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(extension_chunks): - # Fix for UI - extension_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(extension_chunks): + # Fix for UI + extension_chunks[i]['document']['id'] = self.source_id_counter + + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Extract extension-specific 
metadata - webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page') - webpage_url = metadata.get('VisitedWebPageURL', '') - visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') - visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '') - browsing_session_id = metadata.get('BrowsingSessionId', '') - - # Create a more descriptive title for extension data - title = webpage_title - if visit_date: - # Format the date for display (simplified) - try: - # Just extract the date part for display - formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date - title += f" (visited: {formatted_date})" - except: - # Fallback if date parsing fails - title += f" (visited: {visit_date})" + # Extract extension-specific metadata + webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page') + webpage_url = metadata.get('VisitedWebPageURL', '') + visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') + visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '') + browsing_session_id = metadata.get('BrowsingSessionId', '') - # Create a more descriptive description for extension data - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." - - # Add visit duration if available - if visit_duration: - try: - duration_seconds = int(visit_duration) / 1000 - if duration_seconds < 60: - duration_text = f"{duration_seconds:.1f} seconds" - else: - duration_text = f"{duration_seconds/60:.1f} minutes" + # Create a more descriptive title for extension data + title = webpage_title + if visit_date: + # Format the date for display (simplified) + try: + # Just extract the date part for display + formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date + title += f" (visited: {formatted_date})" + except: + # Fallback if date parsing fails + title += f" (visited: {visit_date})" - if description: - description += f" | Duration: {duration_text}" - except: - # Fallback if duration parsing fails - pass + # Create a more descriptive description for extension data + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." 
+ + # Add visit duration if available + if visit_duration: + try: + duration_seconds = int(visit_duration) / 1000 + if duration_seconds < 60: + duration_text = f"{duration_seconds:.1f} seconds" + else: + duration_text = f"{duration_seconds/60:.1f} minutes" + + if description: + description += f" | Duration: {duration_text}" + except: + # Fallback if duration parsing fails + pass - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": webpage_url - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": webpage_url + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -442,7 +589,7 @@ class ConnectorService: return result_object, extension_chunks - async def search_youtube(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_youtube(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for YouTube videos and return both the source information and langchain documents @@ -455,54 +602,75 @@ class ConnectorService: Returns: tuple: (sources_info, langchain_documents) """ - youtube_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="YOUTUBE_VIDEO" - ) + if search_mode == SearchMode.CHUNKS: + youtube_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="YOUTUBE_VIDEO" + ) + elif search_mode == SearchMode.DOCUMENTS: + youtube_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="YOUTUBE_VIDEO" + ) + # Transform document retriever results to match expected format + youtube_chunks = self._transform_document_results(youtube_chunks) + + # Early return if no results + if not youtube_chunks: + return { + "id": 7, + "name": "YouTube Videos", + "type": "YOUTUBE_VIDEO", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(youtube_chunks): - # Fix for UI - youtube_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) - - # Extract YouTube-specific metadata - video_title = metadata.get('video_title', 'Untitled Video') - video_id = metadata.get('video_id', '') - channel_name = metadata.get('channel_name', '') - published_date = metadata.get('published_date', '') - - # Create a more descriptive title for YouTube videos - title = video_title - if channel_name: - title += f" - {channel_name}" + async with self.counter_lock: + for i, chunk in enumerate(youtube_chunks): + # Fix for UI + youtube_chunks[i]['document']['id'] = self.source_id_counter - # Create a more descriptive description for YouTube videos - description = metadata.get('description', chunk.get('content', '')[:100]) - if len(description) == 100: - description += "..." 
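To make the document-mode reshaping concrete, here is roughly what _transform_document_results does to a single hit from the document retriever before the per-connector loops above consume it. The sample field values are invented for illustration.

# Hypothetical document-retriever hit (field values invented for illustration).
document_hit = {
    "document_id": 42,
    "title": "Team sync notes",
    "document_type": "NOTION_CONNECTOR",
    "metadata": {"page_id": "abc123"},
    "chunks_content": "Concatenated chunk text for the whole document...",
    "score": 0.87,
}

# Shape expected by the source/citation loops in the search_* methods.
transformed = {
    "document": {
        "id": document_hit.get("document_id"),
        "title": document_hit.get("title", "Untitled Document"),
        "document_type": document_hit.get("document_type"),
        "metadata": document_hit.get("metadata", {}),
    },
    "content": document_hit.get("chunks_content", document_hit.get("content", "")),
    "score": document_hit.get("score", 0.0),
}
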
+ # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) + + # Extract YouTube-specific metadata + video_title = metadata.get('video_title', 'Untitled Video') + video_id = metadata.get('video_id', '') + channel_name = metadata.get('channel_name', '') + published_date = metadata.get('published_date', '') - # For URL, construct a URL to the YouTube video - url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" + # Create a more descriptive title for YouTube videos + title = video_title + if channel_name: + title += f" - {channel_name}" + + # Create a more descriptive description for YouTube videos + description = metadata.get('description', chunk.get('content', '')[:100]) + if len(description) == 100: + description += "..." + + # For URL, construct a URL to the YouTube video + url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - "video_id": video_id, # Additional field for YouTube videos - "channel_name": channel_name # Additional field for YouTube videos - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + "video_id": video_id, # Additional field for YouTube videos + "channel_name": channel_name # Additional field for YouTube videos + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -514,41 +682,62 @@ class ConnectorService: return result_object, youtube_chunks - async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20) -> tuple: + async def search_github(self, user_query: str, user_id: int, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for GitHub documents and return both the source information and langchain documents Returns: tuple: (sources_info, langchain_documents) """ - github_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="GITHUB_CONNECTOR" - ) + if search_mode == SearchMode.CHUNKS: + github_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="GITHUB_CONNECTOR" + ) + elif search_mode == SearchMode.DOCUMENTS: + github_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="GITHUB_CONNECTOR" + ) + # Transform document retriever results to match expected format + github_chunks = self._transform_document_results(github_chunks) + + # Early return if no results + if not github_chunks: + return { + "id": 8, + "name": "GitHub", + "type": "GITHUB_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(github_chunks): - # Fix for UI - assign a unique ID for citation/source tracking - github_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(github_chunks): + # Fix for UI - assign a unique ID for citation/source tracking + 
github_chunks[i]['document']['id'] = self.source_id_counter + + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a source entry - source = { - "id": self.source_id_counter, - "title": document.get('title', 'GitHub Document'), # Use specific title if available - "description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content preview - "url": metadata.get('url', '') # Use URL if available in metadata - } + # Create a source entry + source = { + "id": self.source_id_counter, + "title": document.get('title', 'GitHub Document'), # Use specific title if available + "description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content preview + "url": metadata.get('url', '') # Use URL if available in metadata + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -560,7 +749,7 @@ class ConnectorService: return result_object, github_chunks - async def search_linear(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: + async def search_linear(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS) -> tuple: """ Search for Linear issues and comments and return both the source information and langchain documents @@ -573,66 +762,87 @@ class ConnectorService: Returns: tuple: (sources_info, langchain_documents) """ - linear_chunks = await self.retriever.hybrid_search( - query_text=user_query, - top_k=top_k, - user_id=user_id, - search_space_id=search_space_id, - document_type="LINEAR_CONNECTOR" - ) + if search_mode == SearchMode.CHUNKS: + linear_chunks = await self.chunk_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="LINEAR_CONNECTOR" + ) + elif search_mode == SearchMode.DOCUMENTS: + linear_chunks = await self.document_retriever.hybrid_search( + query_text=user_query, + top_k=top_k, + user_id=user_id, + search_space_id=search_space_id, + document_type="LINEAR_CONNECTOR" + ) + # Transform document retriever results to match expected format + linear_chunks = self._transform_document_results(linear_chunks) + + # Early return if no results + if not linear_chunks: + return { + "id": 9, + "name": "Linear Issues", + "type": "LINEAR_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(linear_chunks): - # Fix for UI - linear_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) - - # Extract Linear-specific metadata - issue_identifier = metadata.get('issue_identifier', '') - issue_title = metadata.get('issue_title', 'Untitled Issue') - issue_state = metadata.get('state', '') - comment_count = metadata.get('comment_count', 0) - - # Create a more descriptive title for Linear issues - title = f"Linear: {issue_identifier} - {issue_title}" - if issue_state: - title += f" ({issue_state})" + async with self.counter_lock: + for i, chunk in enumerate(linear_chunks): + # Fix for UI + linear_chunks[i]['document']['id'] = self.source_id_counter - # Create a more descriptive description for Linear issues - description = chunk.get('content', '')[:100] - if len(description) == 
100: - description += "..." - - # Add comment count info to description - if comment_count: - if description: - description += f" | Comments: {comment_count}" - else: - description = f"Comments: {comment_count}" - - # For URL, we could construct a URL to the Linear issue if we have the workspace info - # For now, use a generic placeholder - url = "" - if issue_identifier: - # This is a generic format, may need to be adjusted based on actual Linear workspace - url = f"https://linear.app/issue/{issue_identifier}" + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - "issue_identifier": issue_identifier, - "state": issue_state, - "comment_count": comment_count - } + # Extract Linear-specific metadata + issue_identifier = metadata.get('issue_identifier', '') + issue_title = metadata.get('issue_title', 'Untitled Issue') + issue_state = metadata.get('state', '') + comment_count = metadata.get('comment_count', 0) + + # Create a more descriptive title for Linear issues + title = f"Linear: {issue_identifier} - {issue_title}" + if issue_state: + title += f" ({issue_state})" + + # Create a more descriptive description for Linear issues + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." + + # Add comment count info to description + if comment_count: + if description: + description += f" | Comments: {comment_count}" + else: + description = f"Comments: {comment_count}" + + # For URL, we could construct a URL to the Linear issue if we have the workspace info + # For now, use a generic placeholder + url = "" + if issue_identifier: + # This is a generic format, may need to be adjusted based on actual Linear workspace + url = f"https://linear.app/issue/{issue_identifier}" - self.source_id_counter += 1 - sources_list.append(source) + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + "issue_identifier": issue_identifier, + "state": issue_state, + "comment_count": comment_count + } + + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -643,3 +853,109 @@ class ConnectorService: } return result_object, linear_chunks + + async def search_linkup(self, user_query: str, user_id: str, mode: str = "standard") -> tuple: + """ + Search using Linkup API and return both the source information and documents + + Args: + user_query: The user's query + user_id: The user's ID + mode: Search depth mode, can be "standard" or "deep" + + Returns: + tuple: (sources_info, documents) + """ + # Get Linkup connector configuration + linkup_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.LINKUP_API) + + if not linkup_connector: + # Return empty results if no Linkup connector is configured + return { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": [], + }, [] + + # Initialize Linkup client with API key from connector config + linkup_api_key = linkup_connector.config.get("LINKUP_API_KEY") + linkup_client = LinkupClient(api_key=linkup_api_key) + + # Perform search with Linkup + try: + response = linkup_client.search( + query=user_query, + depth=mode, # Use the provided mode ("standard" or "deep") + output_type="searchResults", # Default to search results + ) + + # Extract results from Linkup response - access as attribute instead of using .get() + linkup_results = 
response.results if hasattr(response, 'results') else [] + + # Only proceed if we have results + if not linkup_results: + return { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": [], + }, [] + + # Process each result and create sources directly without deduplication + sources_list = [] + documents = [] + + async with self.counter_lock: + for i, result in enumerate(linkup_results): + # Only process results that have content + if not hasattr(result, 'content') or not result.content: + continue + + # Create a source entry + source = { + "id": self.source_id_counter, + "title": result.name if hasattr(result, 'name') else "Linkup Result", + "description": result.content[:100] if hasattr(result, 'content') else "", + "url": result.url if hasattr(result, 'url') else "" + } + sources_list.append(source) + + # Create a document entry + document = { + "chunk_id": f"linkup_chunk_{i}", + "content": result.content if hasattr(result, 'content') else "", + "score": 1.0, # Default score since not provided by Linkup + "document": { + "id": self.source_id_counter, + "title": result.name if hasattr(result, 'name') else "Linkup Result", + "document_type": "LINKUP_API", + "metadata": { + "url": result.url if hasattr(result, 'url') else "", + "type": result.type if hasattr(result, 'type') else "", + "source": "LINKUP_API" + } + } + } + documents.append(document) + self.source_id_counter += 1 + + # Create result object + result_object = { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": sources_list, + } + + return result_object, documents + + except Exception as e: + # Log the error and return empty results + print(f"Error searching with Linkup: {str(e)}") + return { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": [], + }, [] diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py index 0c9b8f73f..a6f69e4f6 100644 --- a/surfsense_backend/app/utils/document_converters.py +++ b/surfsense_backend/app/utils/document_converters.py @@ -1,19 +1,22 @@ +import hashlib + + async def convert_element_to_markdown(element) -> str: """ Convert an Unstructured element to markdown format based on its category. - + Args: element: The Unstructured API element object - + Returns: str: Markdown formatted string """ element_category = element.metadata["category"] content = element.page_content - + if not content: return "" - + markdown_mapping = { "Formula": lambda x: f"```math\n{x}\n```", "FigureCaption": lambda x: f"*Figure: {x}*", @@ -31,7 +34,7 @@ async def convert_element_to_markdown(element) -> str: "PageNumber": lambda x: f"*Page {x}*\n\n", "UncategorizedText": lambda x: f"{x}\n\n" } - + converter = markdown_mapping.get(element_category, lambda x: x) return converter(content) @@ -39,29 +42,30 @@ async def convert_element_to_markdown(element) -> str: async def convert_document_to_markdown(elements): """ Convert all document elements to markdown. - + Args: elements: List of Unstructured API elements - + Returns: str: Complete markdown document """ markdown_parts = [] - + for element in elements: markdown_text = await convert_element_to_markdown(element) if markdown_text: markdown_parts.append(markdown_text) - + return "".join(markdown_parts) + def convert_chunks_to_langchain_documents(chunks): """ Convert chunks from hybrid search results to LangChain Document objects. 
- + Args: chunks: List of chunk dictionaries from hybrid search results - + Returns: List of LangChain Document objects """ @@ -71,20 +75,20 @@ def convert_chunks_to_langchain_documents(chunks): raise ImportError( "LangChain is not installed. Please install it with `pip install langchain langchain-core`" ) - + langchain_docs = [] - + for chunk in chunks: # Extract content from the chunk content = chunk.get("content", "") - + # Create metadata dictionary metadata = { "chunk_id": chunk.get("chunk_id"), "score": chunk.get("score"), "rank": chunk.get("rank") if "rank" in chunk else None, } - + # Add document information to metadata if "document" in chunk: doc = chunk["document"] @@ -93,24 +97,25 @@ def convert_chunks_to_langchain_documents(chunks): "document_title": doc.get("title"), "document_type": doc.get("document_type"), }) - + # Add document metadata if available if "metadata" in doc: # Prefix document metadata keys to avoid conflicts - doc_metadata = {f"doc_meta_{k}": v for k, v in doc.get("metadata", {}).items()} + doc_metadata = {f"doc_meta_{k}": v for k, + v in doc.get("metadata", {}).items()} metadata.update(doc_metadata) - + # Add source URL if available in metadata if "url" in doc.get("metadata", {}): metadata["source"] = doc["metadata"]["url"] elif "sourceURL" in doc.get("metadata", {}): metadata["source"] = doc["metadata"]["sourceURL"] - + # Ensure source_id is set for citation purposes # Use document_id as the source_id if available if "document_id" in metadata: metadata["source_id"] = metadata["document_id"] - + # Update content for citation mode - format as XML with explicit source_id new_content = f""" @@ -124,13 +129,18 @@ def convert_chunks_to_langchain_documents(chunks): """ - + # Create LangChain Document langchain_doc = LangChainDocument( page_content=new_content, metadata=metadata ) - + langchain_docs.append(langchain_doc) - + return langchain_docs + + +def generate_content_hash(content: str) -> str: + """Generate SHA-256 hash for the given content.""" + return hashlib.sha256(content.encode('utf-8')).hexdigest() diff --git a/surfsense_backend/app/utils/query_service.py b/surfsense_backend/app/utils/query_service.py index 760f0c8fa..4442c8fa7 100644 --- a/surfsense_backend/app/utils/query_service.py +++ b/surfsense_backend/app/utils/query_service.py @@ -1,8 +1,8 @@ -""" -NOTE: This is not used anymore. Might be removed in the future. -""" -from langchain.schema import HumanMessage, SystemMessage +import datetime +from langchain.schema import HumanMessage, SystemMessage, AIMessage from app.config import config +from typing import Any, List, Optional + class QueryService: """ @@ -10,72 +10,91 @@ class QueryService: """ @staticmethod - async def reformulate_query(user_query: str) -> str: + async def reformulate_query_with_chat_history(user_query: str, chat_history_str: Optional[str] = None) -> str: """ Reformulate the user query using the STRATEGIC_LLM to make it more effective for information retrieval and research purposes. - + Args: user_query: The original user query - + chat_history: Optional list of previous chat messages + Returns: str: The reformulated query """ if not user_query or not user_query.strip(): return user_query - + try: # Get the strategic LLM instance from config llm = config.strategic_llm_instance - + # Create system message with instructions system_message = SystemMessage( - content=""" - You are an expert at reformulating user queries to optimize information retrieval. - Your job is to take a user query and reformulate it to: - - 1. 
Make it more specific and detailed - 2. Expand ambiguous terms - 3. Include relevant synonyms and alternative phrasings - 4. Break down complex questions into their core components - 5. Ensure it's comprehensive for research purposes - - The query will be used with the following data sources/connectors: - - SERPER_API: Web search for retrieving current information from the internet - - TAVILY_API: Research-focused search API for comprehensive information - - SLACK_CONNECTOR: Retrieves information from indexed Slack workspace conversations - - NOTION_CONNECTOR: Retrieves information from indexed Notion documents and databases - - FILE: Searches through user's uploaded files - - CRAWLED_URL: Searches through previously crawled web pages - - IMPORTANT: Keep the reformulated query as concise as possible while still being effective. - Avoid unnecessary verbosity and limit the query to only essential terms and concepts. - - Please optimize the query to work effectively across these different data sources. - - Return ONLY the reformulated query without explanations, prefixes, or commentary. - Do not include phrases like "Reformulated query:" or any other text except the query itself. + content=f""" + Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")} + You are a highly skilled AI assistant specializing in query optimization for advanced research. + Your primary objective is to transform a user's initial query into a highly effective search query. + This reformulated query will be used to retrieve information from diverse data sources. + + **Chat History Context:** + {chat_history_str if chat_history_str else "No prior conversation history is available."} + If chat history is provided, analyze it to understand the user's evolving information needs and the broader context of their request. Use this understanding to refine the current query, ensuring it builds upon or clarifies previous interactions. + + **Query Reformulation Guidelines:** + Your reformulated query should: + 1. **Enhance Specificity and Detail:** Add precision to narrow the search focus effectively, making the query less ambiguous and more targeted. + 2. **Resolve Ambiguities:** Identify and clarify vague terms or phrases. If a term has multiple meanings, orient the query towards the most likely one given the context. + 3. **Expand Key Concepts:** Incorporate relevant synonyms, related terms, and alternative phrasings for core concepts. This helps capture a wider range of relevant documents. + 4. **Deconstruct Complex Questions:** If the original query is multifaceted, break it down into its core searchable components or rephrase it to address each aspect clearly. The final output must still be a single, coherent query string. + 5. **Optimize for Comprehensiveness:** Ensure the query is structured to uncover all essential facets of the original request, aiming for thorough information retrieval suitable for research. + 6. **Maintain User Intent:** The reformulated query must stay true to the original intent of the user's query. Do not introduce new topics or shift the focus significantly. + + **Crucial Constraints:** + * **Conciseness and Effectiveness:** While aiming for comprehensiveness, the reformulated query MUST be as concise as possible. Eliminate all unnecessary verbosity. Focus on essential keywords, entities, and concepts that directly contribute to effective retrieval. + * **Single, Direct Output:** Return ONLY the reformulated query itself. 
Do NOT include any explanations, introductory phrases (e.g., "Reformulated query:", "Here is the optimized query:"), or any other surrounding text or markdown formatting. + + Your output should be a single, optimized query string, ready for immediate use in a search system. """ ) - + # Create human message with the user query human_message = HumanMessage( content=f"Reformulate this query for better research results: {user_query}" ) - + # Get the response from the LLM response = await llm.agenerate(messages=[[system_message, human_message]]) - + # Extract the reformulated query from the response reformulated_query = response.generations[0][0].text.strip() - + # Return the original query if the reformulation is empty if not reformulated_query: return user_query - + return reformulated_query - + except Exception as e: # Log the error and return the original query print(f"Error reformulating query: {e}") - return user_query \ No newline at end of file + return user_query + + + @staticmethod + async def langchain_chat_history_to_str(chat_history: List[Any]) -> str: + """ + Convert a list of chat history messages to a string. + """ + chat_history_str = "\n" + + for chat_message in chat_history: + if isinstance(chat_message, HumanMessage): + chat_history_str += f"{chat_message.content}\n" + elif isinstance(chat_message, AIMessage): + chat_history_str += f"{chat_message.content}\n" + elif isinstance(chat_message, SystemMessage): + chat_history_str += f"{chat_message.content}\n" + + chat_history_str += "" + return chat_history_str diff --git a/surfsense_backend/draw.py b/surfsense_backend/draw.py deleted file mode 100644 index ec55f79a5..000000000 --- a/surfsense_backend/draw.py +++ /dev/null @@ -1,5 +0,0 @@ -from app.agents.researcher.graph import graph as researcher_graph -from app.agents.researcher.sub_section_writer.graph import graph as sub_section_writer_graph - -print(researcher_graph.get_graph().draw_mermaid()) -print(sub_section_writer_graph.get_graph().draw_mermaid()) \ No newline at end of file diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 7b7a6f900..dfa755946 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -1,13 +1,13 @@ [project] name = "surf-new-backend" -version = "0.0.6" +version = "0.0.7" description = "SurfSense Backend" readme = "README.md" requires-python = ">=3.12" dependencies = [ "alembic>=1.13.0", "asyncpg>=0.30.0", - "chonkie[all]>=0.4.1", + "chonkie[all]>=1.0.6", "fastapi>=0.115.8", "fastapi-users[oauth,sqlalchemy]>=14.0.1", "firecrawl-py>=1.12.0", @@ -15,14 +15,18 @@ dependencies = [ "langchain-community>=0.3.17", "langchain-unstructured>=0.1.6", "langgraph>=0.3.29", + "linkup-sdk>=0.2.4", "litellm>=1.61.4", + "llama-cloud-services>=0.6.25", "markdownify>=0.14.1", "notion-client>=2.3.0", "pgvector>=0.3.6", "playwright>=1.50.0", + "python-ffmpeg>=2.0.12", "rerankers[flashrank]>=0.7.1", "sentence-transformers>=3.4.1", "slack-sdk>=3.34.0", + "static-ffmpeg>=2.13", "tavily-python>=0.3.2", "unstructured-client>=0.30.0", "unstructured[all-docs]>=0.16.25", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 9b485b0df..968e5c9e8 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -13,6 +13,24 @@ resolution-markers = [ "(python_full_version < '3.12.4' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12.4' and sys_platform != 'darwin' and sys_platform != 'linux')", ] +[[package]] +name = "accelerate" +version = 
"1.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/6e/c29a1dcde7db07f47870ed63e5124086b11874ad52ccd533dc1ca2c799da/accelerate-1.6.0.tar.gz", hash = "sha256:28c1ef1846e690944f98b68dc7b8bb6c51d032d45e85dcbb3adb0c8b99dffb32", size = 363804 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/b1/8198e3cdd11a426b1df2912e3381018c4a4a55368f6d0857ba3ca418ef93/accelerate-1.6.0-py3-none-any.whl", hash = "sha256:1aee717d3d3735ad6d09710a7c26990ee4652b79b4e93df46551551b5227c2aa", size = 354748 }, +] + [[package]] name = "aiofiles" version = "24.1.0" @@ -92,6 +110,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 }, ] +[[package]] +name = "aiosqlite" +version = "0.21.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/13/7d/8bca2bf9a247c2c5dfeec1d7a5f40db6518f88d314b8bca9da29670d2671/aiosqlite-0.21.0.tar.gz", hash = "sha256:131bb8056daa3bc875608c631c678cda73922a2d4ba8aec373b19f18c17e7aa3", size = 13454 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f5/10/6c25ed6de94c49f88a91fa5018cb4c0f3625f31d5be9f771ebe5cc7cd506/aiosqlite-0.21.0-py3-none-any.whl", hash = "sha256:2549cf4057f95f53dcba16f2b64e8e2791d7e1adedb13197dd8ed77bb226d7d0", size = 15792 }, +] + [[package]] name = "alembic" version = "1.15.2" @@ -201,19 +231,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fc/30/d4986a882011f9df997a55e6becd864812ccfcd821d64aac8570ee39f719/attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a", size = 63152 }, ] -[[package]] -name = "autotiktokenizer" -version = "0.2.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "huggingface-hub" }, - { name = "tiktoken" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a6/1a/c6f494750dc67c2e5b06b91ae9565d46adb384f25f61a7136ff79dd02413/autotiktokenizer-0.2.2.tar.gz", hash = "sha256:f0954f14cedfe538b96ba0eed2e39996378c0bdf649fd977d6a047e419e05fdb", size = 15401 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/7b/c34469a1495d755bac1c80fbf3c0c2c29eb03ffe61172d889426025173bd/autotiktokenizer-0.2.2-py3-none-any.whl", hash = "sha256:ebbf15d9d5516fcb3287a8153bd8efbcc932f9c99089b2357255413cf37815d9", size = 8957 }, -] - [[package]] name = "backoff" version = "2.2.1" @@ -223,6 +240,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148 }, ] +[[package]] +name = "banks" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "griffe" }, + { name = "jinja2" }, + { name = "platformdirs" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/77/34/2b6697f02ffb68bee50e5fd37d6c64432244d3245603fd62950169dfed7e/banks-2.1.2.tar.gz", hash 
= "sha256:a0651db9d14b57fa2e115e78f68dbb1b36fe226ad6eef96192542908b1d20c1f", size = 173332 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4a/7fdca29d1db62f5f5c3446bf8f668beacdb0b5a8aff4247574ddfddc6bcd/banks-2.1.2-py3-none-any.whl", hash = "sha256:7fba451069f6bea376483b8136a0f29cb1e6883133626d00e077e20a3d102c0e", size = 28064 }, +] + [[package]] name = "bcrypt" version = "4.2.1" @@ -363,23 +396,36 @@ wheels = [ [[package]] name = "chonkie" -version = "0.4.1" +version = "1.0.6" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "autotiktokenizer" }, + { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2e/94/4a1bc8bdf06e7327bb256abb85767647125286c9bbc7cbcd77a550b96d63/chonkie-0.4.1.tar.gz", hash = "sha256:164216efa01af02e750e7cb218cea87918a18f83ebbd8f020b25557f1ed36aa9", size = 43284 } +sdist = { url = "https://files.pythonhosted.org/packages/5a/db/16d5d23a216db734bcb68e61c466ff48a55dc0d2cdc7ecdd73aaea1f6f7d/chonkie-1.0.6.tar.gz", hash = "sha256:feefad3cbbb62b4a55f4c6409bd8d8f0ee180d8319c4d32e31539a768955b3b0", size = 70056 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/b5/c0d77500a413794773edb630bdc7061121c237a4eaf6ce222226c200d603/chonkie-0.4.1-py3-none-any.whl", hash = "sha256:af7d95d17f4ed60a26e32f0bad60f807287e3301189114755d727657ed2ef964", size = 51193 }, + { url = "https://files.pythonhosted.org/packages/bc/46/d6d9789eb6e61bfa073a13fd2b5cbbcf022a7781adbb060a25d82f16437e/chonkie-1.0.6-py3-none-any.whl", hash = "sha256:d8cfcf665cb6a64ac6ca87da61207372a88b9e5a7bb697faade78069c853e4b1", size = 89526 }, ] [package.optional-dependencies] all = [ + { name = "accelerate" }, + { name = "cohere" }, + { name = "google-genai" }, + { name = "huggingface-hub" }, + { name = "jsonschema" }, + { name = "magika" }, { name = "model2vec" }, { name = "numpy" }, { name = "openai" }, + { name = "pydantic" }, + { name = "rich" }, { name = "sentence-transformers" }, + { name = "tiktoken" }, + { name = "torch" }, + { name = "transformers" }, + { name = "tree-sitter" }, + { name = "tree-sitter-language-pack" }, ] [[package]] @@ -394,6 +440,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, ] +[[package]] +name = "cohere" +version = "5.15.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fastavro" }, + { name = "httpx" }, + { name = "httpx-sse" }, + { name = "pydantic" }, + { name = "pydantic-core" }, + { name = "requests" }, + { name = "tokenizers" }, + { name = "types-requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/33/69c7d1b25a20eafef4197a1444c7f87d5241e936194e54876ea8996157e6/cohere-5.15.0.tar.gz", hash = "sha256:e802d4718ddb0bb655654382ebbce002756a3800faac30296cde7f1bdc6ff2cc", size = 135021 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/87/94694db7fe6df979fbc03286eaabdfa98f1c8fa532960e5afdf965e10960/cohere-5.15.0-py3-none-any.whl", hash = "sha256:22ff867c2a6f2fc2b585360c6072f584f11f275ef6d9242bac24e0fa2df1dfb5", size = 259522 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -534,6 +600,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 }, ] +[[package]] +name = "dirtyjson" +version = "1.0.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/db/04/d24f6e645ad82ba0ef092fa17d9ef7a21953781663648a01c9371d9e8e98/dirtyjson-1.0.8.tar.gz", hash = "sha256:90ca4a18f3ff30ce849d100dcf4a003953c79d3a2348ef056f1d9c22231a25fd", size = 30782 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/69/1bcf70f81de1b4a9f21b3a62ec0c83bdff991c88d6cc2267d02408457e88/dirtyjson-1.0.8-py3-none-any.whl", hash = "sha256:125e27248435a58acace26d5c2c4c11a1c0de0a9c5124c5a94ba78e517d74f53", size = 25197 }, +] + [[package]] name = "distro" version = "1.9.0" @@ -552,6 +627,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/1b/e0a87d256e40e8c888847551b20a017a6b98139178505dc7ffb96f04e954/dnspython-2.7.0-py3-none-any.whl", hash = "sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86", size = 313632 }, ] +[[package]] +name = "docutils" +version = "0.21.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, +] + [[package]] name = "effdet" version = "0.4.1" @@ -660,6 +744,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/08/9968963c1fb8c34627b7f1fbcdfe9438540f87dc7c9bfb59bb4fd19a4ecf/fastapi_users_db_sqlalchemy-7.0.0-py3-none-any.whl", hash = "sha256:5fceac018e7cfa69efc70834dd3035b3de7988eb4274154a0dbe8b14f5aa001e", size = 6891 }, ] +[[package]] +name = "fastavro" +version = "1.10.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/67/7121d2221e998706cac00fa779ec44c1c943cb65e8a7ed1bd57d78d93f2c/fastavro-1.10.0.tar.gz", hash = "sha256:47bf41ac6d52cdfe4a3da88c75a802321321b37b663a900d12765101a5d6886f", size = 987970 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/a4/8e69c0a5cd121e5d476237de1bde5a7947f791ae45768ae52ed0d3ea8d18/fastavro-1.10.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cfe57cb0d72f304bd0dcc5a3208ca6a7363a9ae76f3073307d095c9d053b29d4", size = 1036343 }, + { url = "https://files.pythonhosted.org/packages/1e/01/aa219e2b33e5873d27b867ec0fad9f35f23d461114e1135a7e46c06786d2/fastavro-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74e517440c824cb65fb29d3e3903a9406f4d7c75490cef47e55c4c82cdc66270", size = 3263368 }, + { url = "https://files.pythonhosted.org/packages/a7/ba/1766e2d7d95df2e95e9e9a089dc7a537c0616720b053a111a918fa7ee6b6/fastavro-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:203c17d44cadde76e8eecb30f2d1b4f33eb478877552d71f049265dc6f2ecd10", size = 3328933 }, + { url = 
"https://files.pythonhosted.org/packages/2e/40/26e56696b9696ab4fbba25a96b8037ca3f9fd8a8cc55b4b36400ef023e49/fastavro-1.10.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6575be7f2b5f94023b5a4e766b0251924945ad55e9a96672dc523656d17fe251", size = 3258045 }, + { url = "https://files.pythonhosted.org/packages/4e/bc/2f6c92c06c5363372abe828bccdd95762f2c1983b261509f94189c38c8a1/fastavro-1.10.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe471deb675ed2f01ee2aac958fbf8ebb13ea00fa4ce7f87e57710a0bc592208", size = 3418001 }, + { url = "https://files.pythonhosted.org/packages/0c/ce/cfd16546c04ebbca1be80873b533c788cec76f7bfac231bfac6786047572/fastavro-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:567ff515f2a5d26d9674b31c95477f3e6022ec206124c62169bc2ffaf0889089", size = 487855 }, + { url = "https://files.pythonhosted.org/packages/c9/c4/163cf154cc694c2dccc70cd6796db6214ac668a1260bf0310401dad188dc/fastavro-1.10.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:82263af0adfddb39c85f9517d736e1e940fe506dfcc35bc9ab9f85e0fa9236d8", size = 1022741 }, + { url = "https://files.pythonhosted.org/packages/38/01/a24598f5f31b8582a92fe9c41bf91caeed50d5b5eaa7576e6f8b23cb488d/fastavro-1.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:566c193109ff0ff84f1072a165b7106c4f96050078a4e6ac7391f81ca1ef3efa", size = 3237421 }, + { url = "https://files.pythonhosted.org/packages/a7/bf/08bcf65cfb7feb0e5b1329fafeb4a9b95b7b5ec723ba58c7dbd0d04ded34/fastavro-1.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e400d2e55d068404d9fea7c5021f8b999c6f9d9afa1d1f3652ec92c105ffcbdd", size = 3300222 }, + { url = "https://files.pythonhosted.org/packages/53/4d/a6c25f3166328f8306ec2e6be1123ed78a55b8ab774a43a661124508881f/fastavro-1.10.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b8227497f71565270f9249fc9af32a93644ca683a0167cfe66d203845c3a038", size = 3233276 }, + { url = "https://files.pythonhosted.org/packages/47/1c/b2b2ce2bf866a248ae23e96a87b3b8369427ff79be9112073039bee1d245/fastavro-1.10.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8e62d04c65461b30ac6d314e4197ad666371e97ae8cb2c16f971d802f6c7f514", size = 3388936 }, + { url = "https://files.pythonhosted.org/packages/1f/2c/43927e22a2d57587b3aa09765098a6d833246b672d34c10c5f135414745a/fastavro-1.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:86baf8c9740ab570d0d4d18517da71626fe9be4d1142bea684db52bd5adb078f", size = 483967 }, +] + [[package]] name = "filelock" version = "3.17.0" @@ -858,6 +962,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/b6/60f2910485d32f7bba92cc33e5053b3f29d61fccaa57e5e58c600bb7e0d2/google_cloud_vision-3.10.1-py3-none-any.whl", hash = "sha256:91959ea12b0d6a8442e30c0a5062cd305f349a4840f9184b5061b3153bbd8476", size = 526076 }, ] +[[package]] +name = "google-genai" +version = "1.12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "google-auth" }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "typing-extensions" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/9c/c907dbea921663bb7c41f415337bedd08259d17da8d156396c7237611744/google_genai-1.12.1.tar.gz", hash = "sha256:5c7eda422360643ce602a3f6b23152470ec1039310ef40080cbe4e71237f6391", size = 167752 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/2c/5b454dec837328eb167e78f45a14da502af223f8b94a4824e2fd0df74f19/google_genai-1.12.1-py3-none-any.whl", 
hash = "sha256:7cbc1bc029712946ce41bcf80c0eaa89eb8c09c308efbbfe30fd491f402c258a", size = 165940 }, +] + [[package]] name = "googleapis-common-protos" version = "1.69.2" @@ -903,6 +1025,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/38/08cc303ddddc4b3d7c628c3039a61a3aae36c241ed01393d00c2fd663473/greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6", size = 1142112 }, ] +[[package]] +name = "griffe" +version = "1.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/3e/5aa9a61f7c3c47b0b52a1d930302992229d191bf4bc76447b324b731510a/griffe-1.7.3.tar.gz", hash = "sha256:52ee893c6a3a968b639ace8015bec9d36594961e156e23315c8e8e51401fa50b", size = 395137 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/c6/5c20af38c2a57c15d87f7f38bee77d63c1d2a3689f74fefaf35915dd12b2/griffe-1.7.3-py3-none-any.whl", hash = "sha256:c6b3ee30c2f0f17f30bcdef5068d6ab7a2a4f1b8bf1a3e74b56fffd21e1c5f75", size = 129303 }, +] + [[package]] name = "grpcio" version = "1.71.0" @@ -1068,6 +1202,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794 }, ] +[[package]] +name = "id" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/11/102da08f88412d875fa2f1a9a469ff7ad4c874b0ca6fed0048fe385bdb3d/id-1.5.0.tar.gz", hash = "sha256:292cb8a49eacbbdbce97244f47a97b4c62540169c976552e497fd57df0734c1d", size = 15237 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/cb/18326d2d89ad3b0dd143da971e77afd1e6ca6674f1b1c3df4b6bec6279fc/id-1.5.0-py3-none-any.whl", hash = "sha256:f1434e1cef91f2cbb8a4ec64663d5a23b9ed43ef44c4c957d02583d61714c658", size = 13611 }, +] + [[package]] name = "idna" version = "3.10" @@ -1089,6 +1235,48 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/9d/0fb148dc4d6fa4a7dd1d8378168d9b4cd8d4560a6fbf6f0121c5fc34eb68/importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e", size = 26971 }, ] +[[package]] +name = "jaraco-classes" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "more-itertools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/c0/ed4a27bc5571b99e3cff68f8a9fa5b56ff7df1c2251cc715a652ddd26402/jaraco.classes-3.4.0.tar.gz", hash = "sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd", size = 11780 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/66/b15ce62552d84bbfcec9a4873ab79d993a1dd4edb922cbfccae192bd5b5f/jaraco.classes-3.4.0-py3-none-any.whl", hash = "sha256:f662826b6bed8cace05e7ff873ce0f9283b5c924470fe664fff1c2f00f581790", size = 6777 }, +] + +[[package]] +name = "jaraco-context" +version = "6.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/ad/f3777b81bf0b6e7bc7514a1656d3e637b2e8e15fab2ce3235730b3e7a4e6/jaraco_context-6.0.1.tar.gz", hash = "sha256:9bae4ea555cf0b14938dc0aee7c9f32ed303aa20a3b73e7dc80111628792d1b3", size = 13912 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/ff/db/0c52c4cf5e4bd9f5d7135ec7669a3a767af21b3a308e1ed3674881e52b62/jaraco.context-6.0.1-py3-none-any.whl", hash = "sha256:f797fc481b490edb305122c9181830a3a5b76d84ef6d1aef2fb9b47ab956f9e4", size = 6825 }, +] + +[[package]] +name = "jaraco-functools" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "more-itertools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ab/23/9894b3df5d0a6eb44611c36aec777823fc2e07740dabbd0b810e19594013/jaraco_functools-4.1.0.tar.gz", hash = "sha256:70f7e0e2ae076498e212562325e805204fc092d7b4c17e0e86c959e249701a9d", size = 19159 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/4f/24b319316142c44283d7540e76c7b5a6dbd5db623abd86bb7b3491c21018/jaraco.functools-4.1.0-py3-none-any.whl", hash = "sha256:ad159f13428bc4acbf5541ad6dec511f91573b90fba04df61dafa2a1231cf649", size = 10187 }, +] + +[[package]] +name = "jeepney" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7b/6f/357efd7602486741aa73ffc0617fb310a29b588ed0fd69c2399acbb85b0c/jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732", size = 106758 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/a3/e137168c9c44d18eff0376253da9f1e9234d0239e0ee230d2fee6cea8e55/jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683", size = 49010 }, +] + [[package]] name = "jinja2" version = "3.1.5" @@ -1193,6 +1381,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/0f/8910b19ac0670a0f80ce1008e5e751c4a57e14d2c4c13a482aa6079fa9d6/jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf", size = 18459 }, ] +[[package]] +name = "keyring" +version = "25.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jaraco-classes" }, + { name = "jaraco-context" }, + { name = "jaraco-functools" }, + { name = "jeepney", marker = "sys_platform == 'linux'" }, + { name = "pywin32-ctypes", marker = "sys_platform == 'win32'" }, + { name = "secretstorage", marker = "sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/70/09/d904a6e96f76ff214be59e7aa6ef7190008f52a0ab6689760a98de0bf37d/keyring-25.6.0.tar.gz", hash = "sha256:0b39998aa941431eb3d9b0d4b2460bc773b9df6fed7621c2dfb291a7e0187a66", size = 62750 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d3/32/da7f44bcb1105d3e88a0b74ebdca50c59121d2ddf71c9e34ba47df7f3a56/keyring-25.6.0-py3-none-any.whl", hash = "sha256:552a3f7af126ece7ed5c89753650eec89c7eaae8617d0aa4d9ad2b75111266bd", size = 39085 }, +] + [[package]] name = "kiwisolver" version = "1.4.8" @@ -1413,6 +1618,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8b/e4/5380e8229c442e406404977d2ec71a9db6a3e6a89fce7791c6ad7cd2bdbe/langsmith-0.3.8-py3-none-any.whl", hash = "sha256:fbb9dd97b0f090219447fca9362698d07abaeda1da85aa7cc6ec6517b36581b1", size = 332800 }, ] +[[package]] +name = "linkup-sdk" +version = "0.2.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/c7/d9a85331bf2611ecac67f1ad92a6ced641b2e2e93eea26b17a9af701b3d1/linkup_sdk-0.2.4.tar.gz", hash = 
"sha256:2b8fd1894b9b4715bc14aabcbf53df6def9024f2cc426f234cc59e1807ec4c12", size = 9392 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/d8/bb9e01328fe5ad979e3e459c0f76321d295663906deef56eeaa5ce0cf269/linkup_sdk-0.2.4-py3-none-any.whl", hash = "sha256:8bc4c4f34de93529136a14e42441d803868d681c2bf3fd59be51923e44f1f1d4", size = 8325 }, +] + [[package]] name = "litellm" version = "1.61.4" @@ -1435,6 +1653,72 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/c2/1b6c502909b7af9054736af61e27558a3341e8c1ba28e7f82473e6dd936f/litellm-1.61.4-py3-none-any.whl", hash = "sha256:e87e0d397a191795b4217f9299fc9b21eaacaab91409695f0a4780cceccda6e1", size = 6814517 }, ] +[[package]] +name = "llama-cloud" +version = "0.1.23" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/e4/d1a30167ed6690a408382be1cf7de220a506085f4371baaf067d65bad8fd/llama_cloud-0.1.23.tar.gz", hash = "sha256:3d84a24a860f046d39a106c06742ec0ea39a574ac42bbf91706fe025f44e233e", size = 101292 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/15/3b56acef877dbc5d01d7e1a782c2cc50ef8a08d5773121c3bc20546de582/llama_cloud-0.1.23-py3-none-any.whl", hash = "sha256:ce95b0705d85c99b3b27b0af0d16a17d9a81b14c96bf13c1063a1bd13d8d0446", size = 267343 }, +] + +[[package]] +name = "llama-cloud-services" +version = "0.6.25" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "llama-cloud" }, + { name = "llama-index-core" }, + { name = "platformdirs" }, + { name = "pydantic" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/79/c0/89f89dfc2c2b6c2d5c1c5fde9f445696eb12f9c2a4e17637ab0aaf7cc373/llama_cloud_services-0.6.25.tar.gz", hash = "sha256:3608004b0cf984640a3a36657b8b40394d7ce2c48e3eb9dd24fc654df7643595", size = 32303 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/f1/99b8ef4a636dafd5f1ae1e1b19eb9f793f51573d782919bf01d9b9f797f4/llama_cloud_services-0.6.25-py3-none-any.whl", hash = "sha256:aef0afbbf0d6dc485e6566af2daeeefa8caa7bc7f6511d860036bc0aac15361b", size = 37231 }, +] + +[[package]] +name = "llama-index-core" +version = "0.12.39" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "aiosqlite" }, + { name = "banks" }, + { name = "dataclasses-json" }, + { name = "deprecated" }, + { name = "dirtyjson" }, + { name = "filetype" }, + { name = "fsspec" }, + { name = "httpx" }, + { name = "nest-asyncio" }, + { name = "networkx" }, + { name = "nltk" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "sqlalchemy", extra = ["asyncio"] }, + { name = "tenacity" }, + { name = "tiktoken" }, + { name = "tqdm" }, + { name = "typing-extensions" }, + { name = "typing-inspect" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f7/45/163806502804ff75ace474f868cc33158774c4eb31d565133f32932e930e/llama_index_core-0.12.39.tar.gz", hash = "sha256:0cca9de59953542a3c2f1db61327c5204e0b1e997f31f1200e49392b2879593a", size = 7292040 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/a3/583d80764df75aefc9885f28dcc06a0e5aefc993fa5318186e70f2340d73/llama_index_core-0.12.39-py3-none-any.whl", hash = "sha256:c255ed87aa85e43893f2bb05870b61ce7701d7a6a931d174ba925def5856b4c2", size = 7664906 }, +] + 
[[package]] name = "lxml" version = "5.3.1" @@ -1477,6 +1761,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/80/83/8c54533b3576f4391eebea88454738978669a6cad0d8e23266224007939d/lxml-5.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:91fb6a43d72b4f8863d21f347a9163eecbf36e76e2f51068d59cd004c506f332", size = 3814484 }, ] +[[package]] +name = "magika" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "numpy" }, + { name = "onnxruntime" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/18/ea70f6abd36f455037340f12c8125918c726d08cd6e01f0b76b6884e0c38/magika-0.6.1.tar.gz", hash = "sha256:e3dd22c73936630b1cd79d0f412d6d9a53dc99ba5e3709b1ac53f56bc998e635", size = 3030234 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/be/c9f7bb9ee94abe8d344b660672001313e459c67b867b24abe32d5c80a9ce/magika-0.6.1-py3-none-any.whl", hash = "sha256:15838d2469f1394d8e9598bc7fceea1ede7f35aebe9675c6b45c6b5c48315931", size = 2968516 }, + { url = "https://files.pythonhosted.org/packages/3c/b9/016b174520e81faef5edb31b6c7a73966dc84ee33acd23a2e7b775df7ba4/magika-0.6.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:dadd036296a2e4840fd48fa0712848fe122da438e8f607dc8f19ca4663c359dc", size = 12408519 }, + { url = "https://files.pythonhosted.org/packages/02/b7/e7dfeb235823a82d676c68a748541c24db0249b854f945f6e3cec11c1b7e/magika-0.6.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:133c0e1a844361de86ca2dd7c530e38b324e86177d30c52e36fd82101c190b5c", size = 15089294 }, + { url = "https://files.pythonhosted.org/packages/64/f0/bec5bff0125d08c1bc3baef88beeb910121085249f67b5994ea961615b55/magika-0.6.1-py3-none-win_amd64.whl", hash = "sha256:0342b6230ea9aea7ab4b8fa92e1b46f1cc62e724d452ee8d6821a37f56738d22", size = 12378455 }, +] + [[package]] name = "makefun" version = "1.15.6" @@ -1630,7 +1932,7 @@ wheels = [ [[package]] name = "model2vec" -version = "0.4.0" +version = "0.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2" }, @@ -1642,9 +1944,18 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/83/e2/3fb7bd8c612f71ad3abded92e7401f97f1e71427d3a68a3fb85f39394b17/model2vec-0.4.0.tar.gz", hash = "sha256:48d4a3da040499b0090f736eb8f22ea0fdd35b67462d81d789c70004423adbae", size = 2486998 } +sdist = { url = "https://files.pythonhosted.org/packages/b8/c1/3cd6cab10e8b7da8c32acebf85672d38a26f5f03165bfeaa617a5ec0bb61/model2vec-0.4.1.tar.gz", hash = "sha256:fc6038416679eebe448951708f2d0bebdee8510f47970af1c81a8f054a3c3f9f", size = 2660626 } wheels = [ - { url = "https://files.pythonhosted.org/packages/93/7d/39ff093c4e45303a06e3c5825c6144cbd21f18a1393a154bbf93232b0f1a/model2vec-0.4.0-py3-none-any.whl", hash = "sha256:df30685a55841c61c6638e4f329648e76b148507bd778801d7bfcd6b970a4f2f", size = 38593 }, + { url = "https://files.pythonhosted.org/packages/cd/76/c8575f90f521017597c5e57e3bfef61e3f27d9cb6c741a82a24d72b10a60/model2vec-0.4.1-py3-none-any.whl", hash = "sha256:04a397a17da9b967082b6baa4c494f0be48c89ec4e1a3975b4f290f045238a38", size = 41972 }, +] + +[[package]] +name = "more-itertools" +version = "10.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size 
= 127671 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278 }, ] [[package]] @@ -1722,6 +2033,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263 }, ] +[[package]] +name = "nh3" +version = "0.2.21" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/37/30/2f81466f250eb7f591d4d193930df661c8c23e9056bdc78e365b646054d8/nh3-0.2.21.tar.gz", hash = "sha256:4990e7ee6a55490dbf00d61a6f476c9a3258e31e711e13713b2ea7d6616f670e", size = 16581 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/81/b83775687fcf00e08ade6d4605f0be9c4584cb44c4973d9f27b7456a31c9/nh3-0.2.21-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:fcff321bd60c6c5c9cb4ddf2554e22772bb41ebd93ad88171bbbb6f271255286", size = 1297678 }, + { url = "https://files.pythonhosted.org/packages/22/ee/d0ad8fb4b5769f073b2df6807f69a5e57ca9cea504b78809921aef460d20/nh3-0.2.21-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31eedcd7d08b0eae28ba47f43fd33a653b4cdb271d64f1aeda47001618348fde", size = 733774 }, + { url = "https://files.pythonhosted.org/packages/ea/76/b450141e2d384ede43fe53953552f1c6741a499a8c20955ad049555cabc8/nh3-0.2.21-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d426d7be1a2f3d896950fe263332ed1662f6c78525b4520c8e9861f8d7f0d243", size = 760012 }, + { url = "https://files.pythonhosted.org/packages/97/90/1182275db76cd8fbb1f6bf84c770107fafee0cb7da3e66e416bcb9633da2/nh3-0.2.21-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9d67709bc0d7d1f5797b21db26e7a8b3d15d21c9c5f58ccfe48b5328483b685b", size = 923619 }, + { url = "https://files.pythonhosted.org/packages/29/c7/269a7cfbec9693fad8d767c34a755c25ccb8d048fc1dfc7a7d86bc99375c/nh3-0.2.21-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:55823c5ea1f6b267a4fad5de39bc0524d49a47783e1fe094bcf9c537a37df251", size = 1000384 }, + { url = "https://files.pythonhosted.org/packages/68/a9/48479dbf5f49ad93f0badd73fbb48b3d769189f04c6c69b0df261978b009/nh3-0.2.21-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:818f2b6df3763e058efa9e69677b5a92f9bc0acff3295af5ed013da544250d5b", size = 918908 }, + { url = "https://files.pythonhosted.org/packages/d7/da/0279c118f8be2dc306e56819880b19a1cf2379472e3b79fc8eab44e267e3/nh3-0.2.21-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b3b5c58161e08549904ac4abd450dacd94ff648916f7c376ae4b2c0652b98ff9", size = 909180 }, + { url = "https://files.pythonhosted.org/packages/26/16/93309693f8abcb1088ae143a9c8dbcece9c8f7fb297d492d3918340c41f1/nh3-0.2.21-cp313-cp313t-win32.whl", hash = "sha256:637d4a10c834e1b7d9548592c7aad760611415fcd5bd346f77fd8a064309ae6d", size = 532747 }, + { url = "https://files.pythonhosted.org/packages/a2/3a/96eb26c56cbb733c0b4a6a907fab8408ddf3ead5d1b065830a8f6a9c3557/nh3-0.2.21-cp313-cp313t-win_amd64.whl", hash = "sha256:713d16686596e556b65e7f8c58328c2df63f1a7abe1277d87625dcbbc012ef82", size = 528908 }, + { url = 
"https://files.pythonhosted.org/packages/ba/1d/b1ef74121fe325a69601270f276021908392081f4953d50b03cbb38b395f/nh3-0.2.21-cp38-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a772dec5b7b7325780922dd904709f0f5f3a79fbf756de5291c01370f6df0967", size = 1316133 }, + { url = "https://files.pythonhosted.org/packages/b8/f2/2c7f79ce6de55b41e7715f7f59b159fd59f6cdb66223c05b42adaee2b645/nh3-0.2.21-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d002b648592bf3033adfd875a48f09b8ecc000abd7f6a8769ed86b6ccc70c759", size = 758328 }, + { url = "https://files.pythonhosted.org/packages/6d/ad/07bd706fcf2b7979c51b83d8b8def28f413b090cf0cb0035ee6b425e9de5/nh3-0.2.21-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2a5174551f95f2836f2ad6a8074560f261cf9740a48437d6151fd2d4d7d617ab", size = 747020 }, + { url = "https://files.pythonhosted.org/packages/75/99/06a6ba0b8a0d79c3d35496f19accc58199a1fb2dce5e711a31be7e2c1426/nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b8d55ea1fc7ae3633d758a92aafa3505cd3cc5a6e40470c9164d54dff6f96d42", size = 944878 }, + { url = "https://files.pythonhosted.org/packages/79/d4/dc76f5dc50018cdaf161d436449181557373869aacf38a826885192fc587/nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ae319f17cd8960d0612f0f0ddff5a90700fa71926ca800e9028e7851ce44a6f", size = 903460 }, + { url = "https://files.pythonhosted.org/packages/cd/c3/d4f8037b2ab02ebf5a2e8637bd54736ed3d0e6a2869e10341f8d9085f00e/nh3-0.2.21-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:63ca02ac6f27fc80f9894409eb61de2cb20ef0a23740c7e29f9ec827139fa578", size = 839369 }, + { url = "https://files.pythonhosted.org/packages/11/a9/1cd3c6964ec51daed7b01ca4686a5c793581bf4492cbd7274b3f544c9abe/nh3-0.2.21-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5f77e62aed5c4acad635239ac1290404c7e940c81abe561fd2af011ff59f585", size = 739036 }, + { url = "https://files.pythonhosted.org/packages/fd/04/bfb3ff08d17a8a96325010ae6c53ba41de6248e63cdb1b88ef6369a6cdfc/nh3-0.2.21-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:087ffadfdcd497658c3adc797258ce0f06be8a537786a7217649fc1c0c60c293", size = 768712 }, + { url = "https://files.pythonhosted.org/packages/9e/aa/cfc0bf545d668b97d9adea4f8b4598667d2b21b725d83396c343ad12bba7/nh3-0.2.21-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ac7006c3abd097790e611fe4646ecb19a8d7f2184b882f6093293b8d9b887431", size = 930559 }, + { url = "https://files.pythonhosted.org/packages/78/9d/6f5369a801d3a1b02e6a9a097d56bcc2f6ef98cffebf03c4bb3850d8e0f0/nh3-0.2.21-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:6141caabe00bbddc869665b35fc56a478eb774a8c1dfd6fba9fe1dfdf29e6efa", size = 1008591 }, + { url = "https://files.pythonhosted.org/packages/a6/df/01b05299f68c69e480edff608248313cbb5dbd7595c5e048abe8972a57f9/nh3-0.2.21-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:20979783526641c81d2f5bfa6ca5ccca3d1e4472474b162c6256745fbfe31cd1", size = 925670 }, + { url = "https://files.pythonhosted.org/packages/3d/79/bdba276f58d15386a3387fe8d54e980fb47557c915f5448d8c6ac6f7ea9b/nh3-0.2.21-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a7ea28cd49293749d67e4fcf326c554c83ec912cd09cd94aa7ec3ab1921c8283", size = 917093 }, + { url = "https://files.pythonhosted.org/packages/e7/d8/c6f977a5cd4011c914fb58f5ae573b071d736187ccab31bfb1d539f4af9f/nh3-0.2.21-cp38-abi3-win32.whl", hash = 
"sha256:6c9c30b8b0d291a7c5ab0967ab200598ba33208f754f2f4920e9343bdd88f79a", size = 537623 }, + { url = "https://files.pythonhosted.org/packages/23/fc/8ce756c032c70ae3dd1d48a3552577a325475af2a2f629604b44f571165c/nh3-0.2.21-cp38-abi3-win_amd64.whl", hash = "sha256:bb0014948f04d7976aabae43fcd4cb7f551f9f8ce785a4c9ef66e6c2590f8629", size = 535283 }, +] + [[package]] name = "nltk" version = "3.9.1" @@ -1751,18 +2093,40 @@ wheels = [ [[package]] name = "numpy" -version = "1.26.4" +version = "2.2.5" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129 } +sdist = { url = "https://files.pythonhosted.org/packages/dc/b2/ce4b867d8cd9c0ee84938ae1e6a6f7926ebf928c9090d036fc3c6a04f946/numpy-2.2.5.tar.gz", hash = "sha256:a9c0d994680cd991b1cb772e8b297340085466a6fe964bc9d4e80f5e2f43c291", size = 20273920 } wheels = [ - { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901 }, - { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868 }, - { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109 }, - { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613 }, - { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172 }, - { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643 }, - { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803 }, - { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754 }, + { url = "https://files.pythonhosted.org/packages/e2/f7/1fd4ff108cd9d7ef929b8882692e23665dc9c23feecafbb9c6b80f4ec583/numpy-2.2.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ee461a4eaab4f165b68780a6a1af95fb23a29932be7569b9fab666c407969051", size = 20948633 }, + { url = 
"https://files.pythonhosted.org/packages/12/03/d443c278348371b20d830af155ff2079acad6a9e60279fac2b41dbbb73d8/numpy-2.2.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ec31367fd6a255dc8de4772bd1658c3e926d8e860a0b6e922b615e532d320ddc", size = 14176123 }, + { url = "https://files.pythonhosted.org/packages/2b/0b/5ca264641d0e7b14393313304da48b225d15d471250376f3fbdb1a2be603/numpy-2.2.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:47834cde750d3c9f4e52c6ca28a7361859fcaf52695c7dc3cc1a720b8922683e", size = 5163817 }, + { url = "https://files.pythonhosted.org/packages/04/b3/d522672b9e3d28e26e1613de7675b441bbd1eaca75db95680635dd158c67/numpy-2.2.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:2c1a1c6ccce4022383583a6ded7bbcda22fc635eb4eb1e0a053336425ed36dfa", size = 6698066 }, + { url = "https://files.pythonhosted.org/packages/a0/93/0f7a75c1ff02d4b76df35079676b3b2719fcdfb39abdf44c8b33f43ef37d/numpy-2.2.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d75f338f5f79ee23548b03d801d28a505198297534f62416391857ea0479571", size = 14087277 }, + { url = "https://files.pythonhosted.org/packages/b0/d9/7c338b923c53d431bc837b5b787052fef9ae68a56fe91e325aac0d48226e/numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a801fef99668f309b88640e28d261991bfad9617c27beda4a3aec4f217ea073", size = 16135742 }, + { url = "https://files.pythonhosted.org/packages/2d/10/4dec9184a5d74ba9867c6f7d1e9f2e0fb5fe96ff2bf50bb6f342d64f2003/numpy-2.2.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:abe38cd8381245a7f49967a6010e77dbf3680bd3627c0fe4362dd693b404c7f8", size = 15581825 }, + { url = "https://files.pythonhosted.org/packages/80/1f/2b6fcd636e848053f5b57712a7d1880b1565eec35a637fdfd0a30d5e738d/numpy-2.2.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5a0ac90e46fdb5649ab6369d1ab6104bfe5854ab19b645bf5cda0127a13034ae", size = 17899600 }, + { url = "https://files.pythonhosted.org/packages/ec/87/36801f4dc2623d76a0a3835975524a84bd2b18fe0f8835d45c8eae2f9ff2/numpy-2.2.5-cp312-cp312-win32.whl", hash = "sha256:0cd48122a6b7eab8f06404805b1bd5856200e3ed6f8a1b9a194f9d9054631beb", size = 6312626 }, + { url = "https://files.pythonhosted.org/packages/8b/09/4ffb4d6cfe7ca6707336187951992bd8a8b9142cf345d87ab858d2d7636a/numpy-2.2.5-cp312-cp312-win_amd64.whl", hash = "sha256:ced69262a8278547e63409b2653b372bf4baff0870c57efa76c5703fd6543282", size = 12645715 }, + { url = "https://files.pythonhosted.org/packages/e2/a0/0aa7f0f4509a2e07bd7a509042967c2fab635690d4f48c6c7b3afd4f448c/numpy-2.2.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:059b51b658f4414fff78c6d7b1b4e18283ab5fa56d270ff212d5ba0c561846f4", size = 20935102 }, + { url = "https://files.pythonhosted.org/packages/7e/e4/a6a9f4537542912ec513185396fce52cdd45bdcf3e9d921ab02a93ca5aa9/numpy-2.2.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:47f9ed103af0bc63182609044b0490747e03bd20a67e391192dde119bf43d52f", size = 14191709 }, + { url = "https://files.pythonhosted.org/packages/be/65/72f3186b6050bbfe9c43cb81f9df59ae63603491d36179cf7a7c8d216758/numpy-2.2.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:261a1ef047751bb02f29dfe337230b5882b54521ca121fc7f62668133cb119c9", size = 5149173 }, + { url = "https://files.pythonhosted.org/packages/e5/e9/83e7a9432378dde5802651307ae5e9ea07bb72b416728202218cd4da2801/numpy-2.2.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4520caa3807c1ceb005d125a75e715567806fed67e315cea619d5ec6e75a4191", size = 6684502 }, + { url = 
"https://files.pythonhosted.org/packages/ea/27/b80da6c762394c8ee516b74c1f686fcd16c8f23b14de57ba0cad7349d1d2/numpy-2.2.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d14b17b9be5f9c9301f43d2e2a4886a33b53f4e6fdf9ca2f4cc60aeeee76372", size = 14084417 }, + { url = "https://files.pythonhosted.org/packages/aa/fc/ebfd32c3e124e6a1043e19c0ab0769818aa69050ce5589b63d05ff185526/numpy-2.2.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba321813a00e508d5421104464510cc962a6f791aa2fca1c97b1e65027da80d", size = 16133807 }, + { url = "https://files.pythonhosted.org/packages/bf/9b/4cc171a0acbe4666f7775cfd21d4eb6bb1d36d3a0431f48a73e9212d2278/numpy-2.2.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4cbdef3ddf777423060c6f81b5694bad2dc9675f110c4b2a60dc0181543fac7", size = 15575611 }, + { url = "https://files.pythonhosted.org/packages/a3/45/40f4135341850df48f8edcf949cf47b523c404b712774f8855a64c96ef29/numpy-2.2.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:54088a5a147ab71a8e7fdfd8c3601972751ded0739c6b696ad9cb0343e21ab73", size = 17895747 }, + { url = "https://files.pythonhosted.org/packages/f8/4c/b32a17a46f0ffbde8cc82df6d3daeaf4f552e346df143e1b188a701a8f09/numpy-2.2.5-cp313-cp313-win32.whl", hash = "sha256:c8b82a55ef86a2d8e81b63da85e55f5537d2157165be1cb2ce7cfa57b6aef38b", size = 6309594 }, + { url = "https://files.pythonhosted.org/packages/13/ae/72e6276feb9ef06787365b05915bfdb057d01fceb4a43cb80978e518d79b/numpy-2.2.5-cp313-cp313-win_amd64.whl", hash = "sha256:d8882a829fd779f0f43998e931c466802a77ca1ee0fe25a3abe50278616b1471", size = 12638356 }, + { url = "https://files.pythonhosted.org/packages/79/56/be8b85a9f2adb688e7ded6324e20149a03541d2b3297c3ffc1a73f46dedb/numpy-2.2.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:e8b025c351b9f0e8b5436cf28a07fa4ac0204d67b38f01433ac7f9b870fa38c6", size = 20963778 }, + { url = "https://files.pythonhosted.org/packages/ff/77/19c5e62d55bff507a18c3cdff82e94fe174957bad25860a991cac719d3ab/numpy-2.2.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8dfa94b6a4374e7851bbb6f35e6ded2120b752b063e6acdd3157e4d2bb922eba", size = 14207279 }, + { url = "https://files.pythonhosted.org/packages/75/22/aa11f22dc11ff4ffe4e849d9b63bbe8d4ac6d5fae85ddaa67dfe43be3e76/numpy-2.2.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:97c8425d4e26437e65e1d189d22dff4a079b747ff9c2788057bfb8114ce1e133", size = 5199247 }, + { url = "https://files.pythonhosted.org/packages/4f/6c/12d5e760fc62c08eded0394f62039f5a9857f758312bf01632a81d841459/numpy-2.2.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:352d330048c055ea6db701130abc48a21bec690a8d38f8284e00fab256dc1376", size = 6711087 }, + { url = "https://files.pythonhosted.org/packages/ef/94/ece8280cf4218b2bee5cec9567629e61e51b4be501e5c6840ceb593db945/numpy-2.2.5-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b4c0773b6ada798f51f0f8e30c054d32304ccc6e9c5d93d46cb26f3d385ab19", size = 14059964 }, + { url = "https://files.pythonhosted.org/packages/39/41/c5377dac0514aaeec69115830a39d905b1882819c8e65d97fc60e177e19e/numpy-2.2.5-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55f09e00d4dccd76b179c0f18a44f041e5332fd0e022886ba1c0bbf3ea4a18d0", size = 16121214 }, + { url = "https://files.pythonhosted.org/packages/db/54/3b9f89a943257bc8e187145c6bc0eb8e3d615655f7b14e9b490b053e8149/numpy-2.2.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:02f226baeefa68f7d579e213d0f3493496397d8f1cff5e2b222af274c86a552a", size = 
15575788 }, + { url = "https://files.pythonhosted.org/packages/b1/c4/2e407e85df35b29f79945751b8f8e671057a13a376497d7fb2151ba0d290/numpy-2.2.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c26843fd58f65da9491165072da2cccc372530681de481ef670dcc8e27cfb066", size = 17893672 }, + { url = "https://files.pythonhosted.org/packages/29/7e/d0b44e129d038dba453f00d0e29ebd6eaf2f06055d72b95b9947998aca14/numpy-2.2.5-cp313-cp313t-win32.whl", hash = "sha256:1a161c2c79ab30fe4501d5a2bbfe8b162490757cf90b7f05be8b80bc02f7bb8e", size = 6377102 }, + { url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096 }, ] [[package]] @@ -2219,6 +2583,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/6c/41c21c6c8af92b9fea313aa47c75de49e2f9a467964ee33eb0135d47eb64/pillow-11.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:67cd427c68926108778a9005f2a04adbd5e67c442ed21d95389fe1d595458756", size = 2377651 }, ] +[[package]] +name = "platformdirs" +version = "4.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567 }, +] + [[package]] name = "playwright" version = "1.50.0" @@ -2237,6 +2610,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/2b/e944e10c9b18e77e43d3bb4d6faa323f6cc27597db37b75bc3fd796adfd5/playwright-1.50.0-py3-none-win_amd64.whl", hash = "sha256:1859423da82de631704d5e3d88602d755462b0906824c1debe140979397d2e8d", size = 34784546 }, ] +[[package]] +name = "progress" +version = "1.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/68/d8412d1e0d70edf9791cbac5426dc859f4649afc22f2abbeb0d947cf70fd/progress-1.6.tar.gz", hash = "sha256:c9c86e98b5c03fa1fe11e3b67c1feda4788b8d0fe7336c2ff7d5644ccfba34cd", size = 7842 } + [[package]] name = "propcache" version = "0.2.1" @@ -2576,6 +2955,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863 }, ] +[[package]] +name = "python-ffmpeg" +version = "2.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyee" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/4d/7ecffb341d646e016be76e36f5a42cb32f409c9ca21a57b68f067fad3fc7/python_ffmpeg-2.0.12.tar.gz", hash = "sha256:19ac80af5a064a2f53c245af1a909b2d7648ea045500d96d3bcd507b88d43dc7", size = 14126292 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/6d/02e817aec661defe148cb9eb0c4eca2444846305f625c2243fb9f92a9045/python_ffmpeg-2.0.12-py3-none-any.whl", hash = "sha256:d86697da8dfb39335183e336d31baf42fb217468adf5ac97fd743898240faae3", size = 14411 }, +] + [[package]] name = "python-iso639" version = "2025.2.18" @@ -2641,6 +3033,15 
@@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57", size = 507930 }, ] +[[package]] +name = "pywin32-ctypes" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/9f/01a1a99704853cb63f253eea009390c88e7131c67e66a0a02099a8c917cb/pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755", size = 29471 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/3d/8161f7711c017e01ac9f008dfddd9410dff3674334c233bde66e7ba65bbf/pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8", size = 30756 }, +] + [[package]] name = "pyyaml" version = "6.0.2" @@ -2705,6 +3106,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/43/ca3d1018b392f49131843648e10b08ace23afe8dad3bee5f136e4346b7cd/rapidfuzz-3.12.2-cp313-cp313-win_arm64.whl", hash = "sha256:69f6ecdf1452139f2b947d0c169a605de578efdb72cbb2373cb0a94edca1fd34", size = 863535 }, ] +[[package]] +name = "readme-renderer" +version = "44.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docutils" }, + { name = "nh3" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5a/a9/104ec9234c8448c4379768221ea6df01260cd6c2ce13182d4eac531c8342/readme_renderer-44.0.tar.gz", hash = "sha256:8712034eabbfa6805cacf1402b4eeb2a73028f72d1166d6f5cb7f9c047c5d1e1", size = 32056 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/67/921ec3024056483db83953ae8e48079ad62b92db7880013ca77632921dd0/readme_renderer-44.0-py3-none-any.whl", hash = "sha256:2fbca89b81a08526aadf1357a8c2ae889ec05fb03f5da67f9769c9a592166151", size = 13310 }, +] + [[package]] name = "referencing" version = "0.36.2" @@ -2798,17 +3213,26 @@ flashrank = [ { name = "flashrank" }, ] +[[package]] +name = "rfc3986" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/40/1520d68bfa07ab5a6f065a186815fb6610c86fe957bc065754e47f7b0840/rfc3986-2.0.0.tar.gz", hash = "sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c", size = 49026 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/9a/9afaade874b2fa6c752c36f1548f718b5b83af81ed9b76628329dab81c1b/rfc3986-2.0.0-py2.py3-none-any.whl", hash = "sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd", size = 31326 }, +] + [[package]] name = "rich" -version = "13.9.4" +version = "14.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "markdown-it-py" }, { name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ab/3a/0316b28d0761c6734d6bc14e770d85506c986c85ffb239e688eeaab2c2bc/rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098", size = 223149 } +sdist = { url = "https://files.pythonhosted.org/packages/a1/53/830aa4c3066a8ab0ae9a9955976fb770fe9c6102117c8ec4ab3ea62d89e8/rich-14.0.0.tar.gz", hash = "sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725", size = 224078 } wheels = [ - { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = 
"sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424 }, + { url = "https://files.pythonhosted.org/packages/0d/9b/63f4c7ebc259242c89b3acafdb37b41d1185c07ff0011164674e9076b491/rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0", size = 243229 }, ] [[package]] @@ -2954,6 +3378,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e4/1f/5d46a8d94e9f6d2c913cbb109e57e7eed914de38ea99e2c4d69a9fc93140/scipy-1.15.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bc7136626261ac1ed988dca56cfc4ab5180f75e0ee52e58f1e6aa74b5f3eacd5", size = 43181730 }, ] +[[package]] +name = "secretstorage" +version = "3.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography", marker = "sys_platform != 'darwin'" }, + { name = "jeepney", marker = "sys_platform != 'darwin'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/53/a4/f48c9d79cb507ed1373477dbceaba7401fd8a23af63b837fa61f1dcd3691/SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77", size = 19739 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/24/b4293291fa1dd830f353d2cb163295742fa87f179fcc8a20a306a81978b7/SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99", size = 15221 }, +] + [[package]] name = "sentence-transformers" version = "3.4.1" @@ -3063,9 +3500,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/61/f2b52e107b1fc8944b33ef56bf6ac4ebbe16d91b94d2b87ce013bf63fb84/starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d", size = 71507 }, ] +[[package]] +name = "static-ffmpeg" +version = "2.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "progress" }, + { name = "requests" }, + { name = "twine" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/09/39/1a5d0603280dd681ec52a2a6717c05dab530190dff7887b7603740a1741b/static_ffmpeg-2.13-py3-none-any.whl", hash = "sha256:3bed55a7979f9de9d1eec1126b98774a1d41c2e323811f59973d54b9c94d6dac", size = 7586 }, +] + [[package]] name = "surf-new-backend" -version = "0.0.6" +version = "0.0.7" source = { virtual = "." 
} dependencies = [ { name = "alembic" }, @@ -3078,14 +3529,18 @@ dependencies = [ { name = "langchain-community" }, { name = "langchain-unstructured" }, { name = "langgraph" }, + { name = "linkup-sdk" }, { name = "litellm" }, + { name = "llama-cloud-services" }, { name = "markdownify" }, { name = "notion-client" }, { name = "pgvector" }, { name = "playwright" }, + { name = "python-ffmpeg" }, { name = "rerankers", extra = ["flashrank"] }, { name = "sentence-transformers" }, { name = "slack-sdk" }, + { name = "static-ffmpeg" }, { name = "tavily-python" }, { name = "unstructured", extra = ["all-docs"] }, { name = "unstructured-client" }, @@ -3098,7 +3553,7 @@ dependencies = [ requires-dist = [ { name = "alembic", specifier = ">=1.13.0" }, { name = "asyncpg", specifier = ">=0.30.0" }, - { name = "chonkie", extras = ["all"], specifier = ">=0.4.1" }, + { name = "chonkie", extras = ["all"], specifier = ">=1.0.6" }, { name = "fastapi", specifier = ">=0.115.8" }, { name = "fastapi-users", extras = ["oauth", "sqlalchemy"], specifier = ">=14.0.1" }, { name = "firecrawl-py", specifier = ">=1.12.0" }, @@ -3106,14 +3561,18 @@ requires-dist = [ { name = "langchain-community", specifier = ">=0.3.17" }, { name = "langchain-unstructured", specifier = ">=0.1.6" }, { name = "langgraph", specifier = ">=0.3.29" }, + { name = "linkup-sdk", specifier = ">=0.2.4" }, { name = "litellm", specifier = ">=1.61.4" }, + { name = "llama-cloud-services", specifier = ">=0.6.25" }, { name = "markdownify", specifier = ">=0.14.1" }, { name = "notion-client", specifier = ">=2.3.0" }, { name = "pgvector", specifier = ">=0.3.6" }, { name = "playwright", specifier = ">=1.50.0" }, + { name = "python-ffmpeg", specifier = ">=2.0.12" }, { name = "rerankers", extras = ["flashrank"], specifier = ">=0.7.1" }, { name = "sentence-transformers", specifier = ">=3.4.1" }, { name = "slack-sdk", specifier = ">=3.34.0" }, + { name = "static-ffmpeg", specifier = ">=2.13" }, { name = "tavily-python", specifier = ">=0.3.2" }, { name = "unstructured", extras = ["all-docs"], specifier = ">=0.16.25" }, { name = "unstructured-client", specifier = ">=0.30.0" }, @@ -3324,6 +3783,91 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/1a/efeecb8d83705f2f4beac98d46f2148c95ecd7babfb31b5c0f1e7017e83d/transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36", size = 9669412 }, ] +[[package]] +name = "tree-sitter" +version = "0.24.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/a2/698b9d31d08ad5558f8bfbfe3a0781bd4b1f284e89bde3ad18e05101a892/tree-sitter-0.24.0.tar.gz", hash = "sha256:abd95af65ca2f4f7eca356343391ed669e764f37748b5352946f00f7fc78e734", size = 168304 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/57/3a590f287b5aa60c07d5545953912be3d252481bf5e178f750db75572bff/tree_sitter-0.24.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:14beeff5f11e223c37be7d5d119819880601a80d0399abe8c738ae2288804afc", size = 140788 }, + { url = "https://files.pythonhosted.org/packages/61/0b/fc289e0cba7dbe77c6655a4dd949cd23c663fd62a8b4d8f02f97e28d7fe5/tree_sitter-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26a5b130f70d5925d67b47db314da209063664585a2fd36fa69e0717738efaf4", size = 133945 }, + { url = "https://files.pythonhosted.org/packages/86/d7/80767238308a137e0b5b5c947aa243e3c1e3e430e6d0d5ae94b9a9ffd1a2/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:5fc5c3c26d83c9d0ecb4fc4304fba35f034b7761d35286b936c1db1217558b4e", size = 564819 }, + { url = "https://files.pythonhosted.org/packages/bf/b3/6c5574f4b937b836601f5fb556b24804b0a6341f2eb42f40c0e6464339f4/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:772e1bd8c0931c866b848d0369b32218ac97c24b04790ec4b0e409901945dd8e", size = 579303 }, + { url = "https://files.pythonhosted.org/packages/0a/f4/bd0ddf9abe242ea67cca18a64810f8af230fc1ea74b28bb702e838ccd874/tree_sitter-0.24.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:24a8dd03b0d6b8812425f3b84d2f4763322684e38baf74e5bb766128b5633dc7", size = 581054 }, + { url = "https://files.pythonhosted.org/packages/8c/1c/ff23fa4931b6ef1bbeac461b904ca7e49eaec7e7e5398584e3eef836ec96/tree_sitter-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:f9e8b1605ab60ed43803100f067eed71b0b0e6c1fb9860a262727dbfbbb74751", size = 120221 }, + { url = "https://files.pythonhosted.org/packages/b2/2a/9979c626f303177b7612a802237d0533155bf1e425ff6f73cc40f25453e2/tree_sitter-0.24.0-cp312-cp312-win_arm64.whl", hash = "sha256:f733a83d8355fc95561582b66bbea92ffd365c5d7a665bc9ebd25e049c2b2abb", size = 108234 }, + { url = "https://files.pythonhosted.org/packages/61/cd/2348339c85803330ce38cee1c6cbbfa78a656b34ff58606ebaf5c9e83bd0/tree_sitter-0.24.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d4a6416ed421c4210f0ca405a4834d5ccfbb8ad6692d4d74f7773ef68f92071", size = 140781 }, + { url = "https://files.pythonhosted.org/packages/8b/a3/1ea9d8b64e8dcfcc0051028a9c84a630301290995cd6e947bf88267ef7b1/tree_sitter-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e0992d483677e71d5c5d37f30dfb2e3afec2f932a9c53eec4fca13869b788c6c", size = 133928 }, + { url = "https://files.pythonhosted.org/packages/fe/ae/55c1055609c9428a4aedf4b164400ab9adb0b1bf1538b51f4b3748a6c983/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57277a12fbcefb1c8b206186068d456c600dbfbc3fd6c76968ee22614c5cd5ad", size = 564497 }, + { url = "https://files.pythonhosted.org/packages/ce/d0/f2ffcd04882c5aa28d205a787353130cbf84b2b8a977fd211bdc3b399ae3/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d25fa22766d63f73716c6fec1a31ee5cf904aa429484256bd5fdf5259051ed74", size = 578917 }, + { url = "https://files.pythonhosted.org/packages/af/82/aebe78ea23a2b3a79324993d4915f3093ad1af43d7c2208ee90be9273273/tree_sitter-0.24.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7d5d9537507e1c8c5fa9935b34f320bfec4114d675e028f3ad94f11cf9db37b9", size = 581148 }, + { url = "https://files.pythonhosted.org/packages/a1/b4/6b0291a590c2b0417cfdb64ccb8ea242f270a46ed429c641fbc2bfab77e0/tree_sitter-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:f58bb4956917715ec4d5a28681829a8dad5c342cafd4aea269f9132a83ca9b34", size = 120207 }, + { url = "https://files.pythonhosted.org/packages/a8/18/542fd844b75272630229c9939b03f7db232c71a9d82aadc59c596319ea6a/tree_sitter-0.24.0-cp313-cp313-win_arm64.whl", hash = "sha256:23641bd25dcd4bb0b6fa91b8fb3f46cc9f1c9f475efe4d536d3f1f688d1b84c8", size = 108232 }, +] + +[[package]] +name = "tree-sitter-c-sharp" +version = "0.23.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/22/85/a61c782afbb706a47d990eaee6977e7c2bd013771c5bf5c81c617684f286/tree_sitter_c_sharp-0.23.1.tar.gz", hash = "sha256:322e2cfd3a547a840375276b2aea3335fa6458aeac082f6c60fec3f745c967eb", size = 1317728 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/58/04/f6c2df4c53a588ccd88d50851155945cff8cd887bd70c175e00aaade7edf/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2b612a6e5bd17bb7fa2aab4bb6fc1fba45c94f09cb034ab332e45603b86e32fd", size = 372235 }, + { url = "https://files.pythonhosted.org/packages/99/10/1aa9486f1e28fc22810fa92cbdc54e1051e7f5536a5e5b5e9695f609b31e/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a8b98f62bc53efcd4d971151950c9b9cd5cbe3bacdb0cd69fdccac63350d83e", size = 419046 }, + { url = "https://files.pythonhosted.org/packages/0f/21/13df29f8fcb9ba9f209b7b413a4764b673dfd58989a0dd67e9c7e19e9c2e/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:986e93d845a438ec3c4416401aa98e6a6f6631d644bbbc2e43fcb915c51d255d", size = 415999 }, + { url = "https://files.pythonhosted.org/packages/ca/72/fc6846795bcdae2f8aa94cc8b1d1af33d634e08be63e294ff0d6794b1efc/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8024e466b2f5611c6dc90321f232d8584893c7fb88b75e4a831992f877616d2", size = 402830 }, + { url = "https://files.pythonhosted.org/packages/fe/3a/b6028c5890ce6653807d5fa88c72232c027c6ceb480dbeb3b186d60e5971/tree_sitter_c_sharp-0.23.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7f9bf876866835492281d336b9e1f9626ab668737f74e914c31d285261507da7", size = 397880 }, + { url = "https://files.pythonhosted.org/packages/47/d2/4facaa34b40f8104d8751746d0e1cd2ddf0beb9f1404b736b97f372bd1f3/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_amd64.whl", hash = "sha256:ae9a9e859e8f44e2b07578d44f9a220d3fa25b688966708af6aa55d42abeebb3", size = 377562 }, + { url = "https://files.pythonhosted.org/packages/d8/88/3cf6bd9959d94d1fec1e6a9c530c5f08ff4115a474f62aedb5fedb0f7241/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_arm64.whl", hash = "sha256:c81548347a93347be4f48cb63ec7d60ef4b0efa91313330e69641e49aa5a08c5", size = 375157 }, +] + +[[package]] +name = "tree-sitter-embedded-template" +version = "0.23.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/28/d6/5a58ea2f0480f5ed188b733114a8c275532a2fd1568b3898793b13d28af5/tree_sitter_embedded_template-0.23.2.tar.gz", hash = "sha256:7b24dcf2e92497f54323e617564d36866230a8bfb719dbb7b45b461510dcddaa", size = 8471 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/c1/be0c48ed9609b720e74ade86f24ea086e353fe9c7405ee9630c3d52d09a2/tree_sitter_embedded_template-0.23.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:a505c2d2494464029d79db541cab52f6da5fb326bf3d355e69bf98b84eb89ae0", size = 9554 }, + { url = "https://files.pythonhosted.org/packages/6d/a5/7c12f5d302525ee36d1eafc28a68e4454da5bad208436d547326bee4ed76/tree_sitter_embedded_template-0.23.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:28028b93b42cc3753261ae7ce066675d407f59de512417524f9c3ab7792b1d37", size = 10051 }, + { url = "https://files.pythonhosted.org/packages/cd/87/95aaba8b64b849200bd7d4ae510cc394ecaef46a031499cbff301766970d/tree_sitter_embedded_template-0.23.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec399d59ce93ffb60759a2d96053eed529f3c3f6a27128f261710d0d0de60e10", size = 17532 }, + { url = 
"https://files.pythonhosted.org/packages/13/f8/8c837b898f00b35f9f3f76a4abc525e80866a69343083c9ff329e17ecb03/tree_sitter_embedded_template-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcfa01f62b88d50dbcb736cc23baec8ddbfe08daacfdc613eee8c04ab65efd09", size = 17394 }, + { url = "https://files.pythonhosted.org/packages/89/9b/893adf9e465d2d7f14870871bf2f3b30045e5ac417cb596f667a72eda493/tree_sitter_embedded_template-0.23.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6debd24791466f887109a433c31aa4a5deeba2b217817521c745a4e748a944ed", size = 16439 }, + { url = "https://files.pythonhosted.org/packages/40/96/e79934572723673db9f867000500c6eea61a37705e02c7aee9ee031bbb6f/tree_sitter_embedded_template-0.23.2-cp39-abi3-win_amd64.whl", hash = "sha256:158fecb38be5b15db0190ef7238e5248f24bf32ae3cab93bc1197e293a5641eb", size = 12572 }, + { url = "https://files.pythonhosted.org/packages/63/06/27f678b9874e4e2e39ddc6f5cce3374c8c60e6046ea8588a491ab6fc9fcb/tree_sitter_embedded_template-0.23.2-cp39-abi3-win_arm64.whl", hash = "sha256:9f1f3b79fe273f3d15a5b64c85fc6ebfb48decfbe8542accd05f5b7694860df0", size = 11232 }, +] + +[[package]] +name = "tree-sitter-language-pack" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tree-sitter" }, + { name = "tree-sitter-c-sharp" }, + { name = "tree-sitter-embedded-template" }, + { name = "tree-sitter-yaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/1e/2d63d93025fd5b527327c3fd348955cebaec02a3f1bcec88ab4d88ddfc39/tree_sitter_language_pack-0.7.2.tar.gz", hash = "sha256:46fc96cc3bddfee7091fdedec2ae7e34218679e58241e8319bf82026f6d02eae", size = 59264078 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/9d/2c6272bf4fd18a22d8c07d3c983940dbece4f0e9e21f5c78f15a2740f435/tree_sitter_language_pack-0.7.2-cp39-abi3-macosx_10_13_universal2.whl", hash = "sha256:4036603020bd32060d9931a64f8c3d8637de575f350f11534971012e51a27a95", size = 28132977 }, + { url = "https://files.pythonhosted.org/packages/2b/e2/0f2511019c27b870061f9ad719074095ef84cd7857a730765bfa066384be/tree_sitter_language_pack-0.7.2-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:801926dbc81eeca4ce97b846cc899dcf3fecfdc3b2514a68eeeb118f70ac686d", size = 17576769 }, + { url = "https://files.pythonhosted.org/packages/3a/88/7b38233def5c359503ad4d36533f96f9fe2943a8eeeced66b36312c49e1b/tree_sitter_language_pack-0.7.2-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:77be80335fb585f48eb268b0e07ca54f3da8f30c2eab7be749113f116c3ef316", size = 17433872 }, + { url = "https://files.pythonhosted.org/packages/f8/27/fc5dce240b68a1ed876bc80b2238fbaaa0f695dbaf88660728a0239a2b20/tree_sitter_language_pack-0.7.2-cp39-abi3-win_amd64.whl", hash = "sha256:d71c6b4c14b3370ca783319ede7a581a10e6dd1bdfe5d31d316d9216981a6406", size = 14316050 }, +] + +[[package]] +name = "tree-sitter-yaml" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/93/04/6de8be8112c50450cab753fcd6b74d8368c60f6099bf551cee0bec69563a/tree_sitter_yaml-0.7.0.tar.gz", hash = "sha256:9c8bb17d9755c3b0e757260917240c0d19883cd3b59a5d74f205baa8bf8435a4", size = 85085 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/1d/243dbdf59fae8a4109e19f0994e2627ddedb2e16b7cf99bd42be64367742/tree_sitter_yaml-0.7.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:e21553ac190ae05bf82796df8beb4d9158ba195b5846018cb36fbc3a35bd0679", size = 43335 }, 
+ { url = "https://files.pythonhosted.org/packages/e2/63/e5d5868a1498e20fd07e7db62933766fd64950279862e3e7f150b88ec69d/tree_sitter_yaml-0.7.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c022054f1f9b54201082ea83073a6c24c42d0436ad8ee99ff2574cba8f928c28", size = 44574 }, + { url = "https://files.pythonhosted.org/packages/f5/ba/9cff9a3fddb1b6b38bc71ce1dfdb8892ab15a4042c104f4582e30318b412/tree_sitter_yaml-0.7.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cd1725142f19e41c51d27c99cfc60780f596e069eb181cfa6433d993a19aa3d", size = 93088 }, + { url = "https://files.pythonhosted.org/packages/19/09/39d29d9a22cee0b3c3e4f3fdbd23e4534b9c2a84b5f962f369eafcfbf88c/tree_sitter_yaml-0.7.0-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d1b268378254f75bb27396d83c96d886ccbfcda6bd8c2778e94e3e1d2459085", size = 91367 }, + { url = "https://files.pythonhosted.org/packages/b0/b7/285653b894b351436917b5fe5e738eecaeb2128b4e4bf72bfe0c6043f62e/tree_sitter_yaml-0.7.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:27c2e7f4f49ddf410003abbb82a7b00ec77ea263d8ef08dbce1a15d293eed2fd", size = 87405 }, + { url = "https://files.pythonhosted.org/packages/bb/73/0cdc82ea653c190475a4f63dd4a1f4efd5d1c7d09d2668b8d84008a4c4f8/tree_sitter_yaml-0.7.0-cp39-abi3-win_amd64.whl", hash = "sha256:98dce0d6bc376f842cfb1d3c32512eea95b37e61cd2c87074bb4b05c999917c8", size = 45360 }, + { url = "https://files.pythonhosted.org/packages/2e/32/af2d676b0176a958f22a75b04be836e09476a10844baab78c018a5030297/tree_sitter_yaml-0.7.0-cp39-abi3-win_arm64.whl", hash = "sha256:f0f8d8e05fa8e70f08d0f18a209d6026e171844f4ea7090e7c779b9c375b3a31", size = 43650 }, +] + [[package]] name = "triton" version = "3.2.0" @@ -3333,6 +3877,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/30/37a3384d1e2e9320331baca41e835e90a3767303642c7a80d4510152cbcf/triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0", size = 253154278 }, ] +[[package]] +name = "twine" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "id" }, + { name = "keyring", marker = "platform_machine != 'ppc64le' and platform_machine != 's390x'" }, + { name = "packaging" }, + { name = "readme-renderer" }, + { name = "requests" }, + { name = "requests-toolbelt" }, + { name = "rfc3986" }, + { name = "rich" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c8/a2/6df94fc5c8e2170d21d7134a565c3a8fb84f9797c1dd65a5976aaf714418/twine-6.1.0.tar.gz", hash = "sha256:be324f6272eff91d07ee93f251edf232fc647935dd585ac003539b42404a8dbd", size = 168404 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/b6/74e927715a285743351233f33ea3c684528a0d374d2e43ff9ce9585b73fe/twine-6.1.0-py3-none-any.whl", hash = "sha256:a47f973caf122930bf0fbbf17f80b83bc1602c9ce393c7845f289a3001dc5384", size = 40791 }, +] + +[[package]] +name = "types-requests" +version = "2.32.0.20250328" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/7d/eb174f74e3f5634eaacb38031bbe467dfe2e545bc255e5c90096ec46bc46/types_requests-2.32.0.20250328.tar.gz", hash = "sha256:c9e67228ea103bd811c96984fac36ed2ae8da87a36a633964a21f199d60baf32", size = 22995 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/cc/15/3700282a9d4ea3b37044264d3e4d1b1f0095a4ebf860a99914fd544e3be3/types_requests-2.32.0.20250328-py3-none-any.whl", hash = "sha256:72ff80f84b15eb3aa7a8e2625fffb6a93f2ad5a0c20215fc1dcfa61117bcb2a2", size = 20663 }, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -3366,7 +3942,7 @@ wheels = [ [[package]] name = "unstructured" -version = "0.16.25" +version = "0.17.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "backoff" }, @@ -3391,9 +3967,9 @@ dependencies = [ { name = "unstructured-client" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/64/31/98c4c78e305d1294888adf87fd5ee30577a4c393951341ca32b43f167f1e/unstructured-0.16.25.tar.gz", hash = "sha256:73b9b0f51dbb687af572ecdb849a6811710b9cac797ddeab8ee80fa07d8aa5e6", size = 1683097 } +sdist = { url = "https://files.pythonhosted.org/packages/b4/49/b95ff4b609d7328cd0394ac9d8ad69839e11a1f879462496afcf4887154a/unstructured-0.17.2.tar.gz", hash = "sha256:af18c3caef0a6c562cf77e34ee8b6ff522b605031d2336ffe565df66f126aa46", size = 1684745 } wheels = [ - { url = "https://files.pythonhosted.org/packages/12/4f/ad08585b5c8a33c82ea119494c4d3023f4796958c56e668b15cc282ec0a0/unstructured-0.16.25-py3-none-any.whl", hash = "sha256:14719ccef2830216cf1c5bf654f75e2bf07b17ca5dcee9da5ac74618130fd337", size = 1769286 }, + { url = "https://files.pythonhosted.org/packages/cb/88/061a9dedd4e8cc0c31097c3275a9ef1fd7307e26afac5cd582487386e1b8/unstructured-0.17.2-py3-none-any.whl", hash = "sha256:527dd26a4b273aebef2f9119c9d4f0d0ce17640038d92296d23abe89be123840", size = 1771563 }, ] [package.optional-dependencies] @@ -3403,6 +3979,7 @@ all-docs = [ { name = "markdown" }, { name = "networkx" }, { name = "onnx" }, + { name = "onnxruntime" }, { name = "openpyxl" }, { name = "pandas" }, { name = "pdf2image" }, diff --git a/surfsense_browser_extension/package.json b/surfsense_browser_extension/package.json index 3d2d04002..9274154ca 100644 --- a/surfsense_browser_extension/package.json +++ b/surfsense_browser_extension/package.json @@ -1,7 +1,7 @@ { "name": "surfsense_browser_extension", "displayName": "Surfsense Browser Extension", - "version": "0.0.6", + "version": "0.0.7", "description": "Extension to collect Browsing History for SurfSense.", "author": "https://github.com/MODSetter", "scripts": { diff --git a/surfsense_web/.env.example b/surfsense_web/.env.example index abd370cbf..03f266baa 100644 --- a/surfsense_web/.env.example +++ b/surfsense_web/.env.example @@ -1 +1,3 @@ -NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000 \ No newline at end of file +NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000 +NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE +NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD \ No newline at end of file diff --git a/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx b/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx index c481bd6ec..7b14c7ed1 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx @@ -3,7 +3,7 @@ import { useState, useEffect } from 'react'; import { motion, AnimatePresence } from 'framer-motion'; import { useSearchParams } from 'next/navigation'; -import { MessageCircleMore, Search, Calendar, Tag, Trash2, ExternalLink, MoreHorizontal } from 'lucide-react'; +import { MessageCircleMore, Search, Calendar, Tag, Trash2, ExternalLink, MoreHorizontal, Radio, 
CheckCircle, Circle, Podcast } from 'lucide-react'; import { format } from 'date-fns'; // UI Components @@ -42,6 +42,9 @@ import { SelectTrigger, SelectValue, } from "@/components/ui/select"; +import { Checkbox } from "@/components/ui/checkbox"; +import { Label } from "@/components/ui/label"; +import { toast } from "sonner"; interface Chat { created_at: string; @@ -92,6 +95,18 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) const [chatToDelete, setChatToDelete] = useState<{ id: number, title: string } | null>(null); const [isDeleting, setIsDeleting] = useState(false); + // New state for podcast generation + const [selectedChats, setSelectedChats] = useState([]); + const [selectionMode, setSelectionMode] = useState(false); + const [podcastDialogOpen, setPodcastDialogOpen] = useState(false); + const [podcastTitle, setPodcastTitle] = useState(""); + const [isGeneratingPodcast, setIsGeneratingPodcast] = useState(false); + + // New state for individual podcast generation + const [currentChatIndex, setCurrentChatIndex] = useState(0); + const [podcastTitles, setPodcastTitles] = useState<{[key: number]: string}>({}); + const [processingChat, setProcessingChat] = useState(null); + const chatsPerPage = 9; const searchParams = useSearchParams(); @@ -234,6 +249,177 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) // Get unique chat types for filter dropdown const chatTypes = ['all', ...Array.from(new Set(chats.map(chat => chat.type)))]; + // Generate individual podcasts from selected chats + const handleGeneratePodcast = async () => { + if (selectedChats.length === 0) { + toast.error("Please select at least one chat"); + return; + } + + const currentChatId = selectedChats[currentChatIndex]; + const currentTitle = podcastTitles[currentChatId] || podcastTitle; + + if (!currentTitle.trim()) { + toast.error("Please enter a podcast title"); + return; + } + + setIsGeneratingPodcast(true); + try { + const token = localStorage.getItem('surfsense_bearer_token'); + if (!token) { + toast.error("Authentication error. 
Please log in again."); + setIsGeneratingPodcast(false); + return; + } + + // Create payload for single chat + const payload = { + type: "CHAT", + ids: [currentChatId], // Single chat ID + search_space_id: parseInt(searchSpaceId), + podcast_title: currentTitle + }; + + const response = await fetch(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/generate/`, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(payload) + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error(errorData.detail || "Failed to generate podcast"); + } + + const data = await response.json(); + toast.success(`Podcast "${currentTitle}" generation started!`); + + // Move to the next chat or finish + if (currentChatIndex < selectedChats.length - 1) { + // Set up for next chat + setCurrentChatIndex(currentChatIndex + 1); + + // Find the next chat from the chats array + const nextChatId = selectedChats[currentChatIndex + 1]; + const nextChat = chats.find(chat => chat.id === nextChatId) || null; + setProcessingChat(nextChat); + + // Default title for the next chat + if (!podcastTitles[nextChatId]) { + setPodcastTitle(nextChat?.title || `Podcast from Chat ${nextChatId}`); + } else { + setPodcastTitle(podcastTitles[nextChatId]); + } + + setIsGeneratingPodcast(false); + } else { + // All done + finishPodcastGeneration(); + } + } catch (error) { + console.error('Error generating podcast:', error); + toast.error(error instanceof Error ? error.message : 'Failed to generate podcast'); + setIsGeneratingPodcast(false); + } + }; + + // Helper to finish the podcast generation process + const finishPodcastGeneration = () => { + toast.success("All podcasts are being generated! 
Check the podcasts tab to see them when ready."); + setPodcastDialogOpen(false); + setSelectedChats([]); + setSelectionMode(false); + setCurrentChatIndex(0); + setPodcastTitles({}); + setProcessingChat(null); + setPodcastTitle(""); + setIsGeneratingPodcast(false); + }; + + // Start podcast generation flow + const startPodcastGeneration = () => { + if (selectedChats.length === 0) { + toast.error("Please select at least one chat"); + return; + } + + // Reset the state for podcast generation + setCurrentChatIndex(0); + setPodcastTitles({}); + + // Set up for the first chat + const firstChatId = selectedChats[0]; + const firstChat = chats.find(chat => chat.id === firstChatId) || null; + setProcessingChat(firstChat); + + // Set default title for the first chat + setPodcastTitle(firstChat?.title || `Podcast from Chat ${firstChatId}`); + setPodcastDialogOpen(true); + }; + + // Update the title for the current chat + const updateCurrentChatTitle = (title: string) => { + const currentChatId = selectedChats[currentChatIndex]; + setPodcastTitle(title); + setPodcastTitles(prev => ({ + ...prev, + [currentChatId]: title + })); + }; + + // Skip generating a podcast for the current chat + const skipCurrentChat = () => { + if (currentChatIndex < selectedChats.length - 1) { + // Move to the next chat + setCurrentChatIndex(currentChatIndex + 1); + + // Find the next chat + const nextChatId = selectedChats[currentChatIndex + 1]; + const nextChat = chats.find(chat => chat.id === nextChatId) || null; + setProcessingChat(nextChat); + + // Set default title for the next chat + if (!podcastTitles[nextChatId]) { + setPodcastTitle(nextChat?.title || `Podcast from Chat ${nextChatId}`); + } else { + setPodcastTitle(podcastTitles[nextChatId]); + } + } else { + // All done (all skipped) + finishPodcastGeneration(); + } + }; + + // Toggle chat selection + const toggleChatSelection = (chatId: number) => { + setSelectedChats(prev => + prev.includes(chatId) + ? prev.filter(id => id !== chatId) + : [...prev, chatId] + ); + }; + + // Select all visible chats + const selectAllVisibleChats = () => { + const visibleChatIds = currentChats.map(chat => chat.id); + setSelectedChats(prev => { + const allSelected = visibleChatIds.every(id => prev.includes(id)); + return allSelected + ? prev.filter(id => !visibleChatIds.includes(id)) // Deselect all visible if all are selected + : [...new Set([...prev, ...visibleChatIds])]; // Add all visible, ensuring no duplicates + }); + }; + + // Cancel selection mode + const cancelSelectionMode = () => { + setSelectionMode(false); + setSelectedChats([]); + }; + return ( -
- +
+ {selectionMode ? ( + <> + + + + + ) : ( + <> + + + + )}
@@ -334,44 +565,79 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) animate="animate" exit="exit" transition={{ duration: 0.2, delay: index * 0.05 }} - className="overflow-hidden hover:shadow-md transition-shadow" + className={`overflow-hidden hover:shadow-md transition-shadow + ${selectionMode && selectedChats.includes(chat.id) + ? 'ring-2 ring-primary ring-offset-2' : ''}`} + onClick={(e) => { + if (!selectionMode) return; + // Ignore clicks coming from interactive elements + if ((e.target as HTMLElement).closest('button, a, [data-stop-selection]')) return; + toggleChatSelection(chat.id); + }} >
-
- {chat.title || `Chat ${chat.id}`} - - - - {format(new Date(chat.created_at), 'MMM d, yyyy')} - - +
+ {selectionMode && ( +
+ {selectedChats.includes(chat.id) + ? + : } +
+ )} +
+ {chat.title || `Chat ${chat.id}`} + + + + {format(new Date(chat.created_at), 'MMM d, yyyy')} + + +
- - - - - - window.location.href = `/dashboard/${chat.search_space_id}/researcher/${chat.id}`}> - - View Chat - - - { - setChatToDelete({ id: chat.id, title: chat.title || `Chat ${chat.id}` }); - setDeleteDialogOpen(true); - }} - > - - Delete Chat - - - + {!selectionMode && ( + + + + + + window.location.href = `/dashboard/${chat.search_space_id}/researcher/${chat.id}`}> + + View Chat + + { + setSelectedChats([chat.id]); + setPodcastTitle(chat.title || `Chat ${chat.id}`); + setPodcastDialogOpen(true); + }} + > + + Generate Podcast + + + { + e.stopPropagation(); + setChatToDelete({ id: chat.id, title: chat.title || `Chat ${chat.id}` }); + setDeleteDialogOpen(true); + }} + > + + Delete Chat + + + + )}
@@ -505,6 +771,104 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) + + {/* Podcast Generation Dialog */} + { + if (!isOpen) { + // Cancel the process if dialog is closed + setPodcastDialogOpen(false); + setSelectedChats([]); + setSelectionMode(false); + setCurrentChatIndex(0); + setPodcastTitles({}); + setProcessingChat(null); + setPodcastTitle(""); + } else { + setPodcastDialogOpen(true); + } + }} + > + + + + + Generate Podcast {currentChatIndex + 1} of {selectedChats.length} + + + {selectedChats.length > 1 ? ( + <>Creating individual podcasts for each selected chat. Currently processing: {processingChat?.title || `Chat ${selectedChats[currentChatIndex]}`} + ) : ( + <>Create a podcast from this chat. The podcast will be available in the podcasts section once generated. + )} + + + +
+
+ + updateCurrentChatTitle(e.target.value)} + /> +
+ + {selectedChats.length > 1 && ( +
+
+
+ )} +
+ + + {selectedChats.length > 1 && !isGeneratingPodcast && ( + + )} + + + +
+
); } \ No newline at end of file diff --git a/surfsense_web/app/dashboard/[search_space_id]/chats/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/chats/page.tsx index 58c89f421..f382d633c 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/chats/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/chats/page.tsx @@ -8,8 +8,8 @@ interface PageProps { } export default async function ChatsPage({ params }: PageProps) { - // Await params to properly access dynamic route parameters - const searchSpaceId = params.search_space_id; + // Get search space ID from the route parameter + const { search_space_id: searchSpaceId } = await Promise.resolve(params); return ( diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx index 24fe6265d..95c769c00 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx @@ -4,255 +4,308 @@ import { useState, useEffect } from "react"; import { useRouter, useParams } from "next/navigation"; import { motion } from "framer-motion"; import { toast } from "sonner"; -import { Edit, Plus, Search, Trash2, ExternalLink, RefreshCw } from "lucide-react"; +import { + Edit, + Plus, + Search, + Trash2, + ExternalLink, + RefreshCw, +} from "lucide-react"; import { useSearchSourceConnectors } from "@/hooks/useSearchSourceConnectors"; import { Button } from "@/components/ui/button"; import { - Card, - CardContent, - CardDescription, - CardFooter, - CardHeader, - CardTitle, + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, } from "@/components/ui/card"; import { - Table, - TableBody, - TableCell, - TableHead, - TableHeader, - TableRow, + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, } from "@/components/ui/table"; import { - AlertDialog, - AlertDialogAction, - AlertDialogCancel, - AlertDialogContent, - AlertDialogDescription, - AlertDialogFooter, - AlertDialogHeader, - AlertDialogTitle, - AlertDialogTrigger, + AlertDialog, + AlertDialogAction, + AlertDialogCancel, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, + AlertDialogTrigger, } from "@/components/ui/alert-dialog"; -import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; +import { + Tooltip, + TooltipContent, + TooltipProvider, + TooltipTrigger, +} from "@/components/ui/tooltip"; +import { getConnectorIcon } from "@/components/chat"; // Helper function to get connector type display name const getConnectorTypeDisplay = (type: string): string => { - const typeMap: Record = { - "SERPER_API": "Serper API", - "TAVILY_API": "Tavily API", - "SLACK_CONNECTOR": "Slack", - "NOTION_CONNECTOR": "Notion", - "GITHUB_CONNECTOR": "GitHub", - "LINEAR_CONNECTOR": "Linear", - // Add other connector types here as needed - }; - return typeMap[type] || type; + const typeMap: Record = { + SERPER_API: "Serper API", + TAVILY_API: "Tavily API", + SLACK_CONNECTOR: "Slack", + NOTION_CONNECTOR: "Notion", + GITHUB_CONNECTOR: "GitHub", + LINEAR_CONNECTOR: "Linear", + LINKUP_API: "Linkup", + // Add other connector types here as needed + }; + return typeMap[type] || type; }; // Helper function to format date with time const formatDateTime = (dateString: string | null): string => { - if (!dateString) return "Never"; - - const date = new Date(dateString); - 
return new Intl.DateTimeFormat('en-US', { - year: 'numeric', - month: 'short', - day: 'numeric', - hour: '2-digit', - minute: '2-digit' - }).format(date); + if (!dateString) return "Never"; + + const date = new Date(dateString); + return new Intl.DateTimeFormat("en-US", { + year: "numeric", + month: "short", + day: "numeric", + hour: "2-digit", + minute: "2-digit", + }).format(date); }; export default function ConnectorsPage() { - const router = useRouter(); - const params = useParams(); - const searchSpaceId = params.search_space_id as string; - - const { connectors, isLoading, error, deleteConnector, indexConnector } = useSearchSourceConnectors(); - const [connectorToDelete, setConnectorToDelete] = useState(null); - const [indexingConnectorId, setIndexingConnectorId] = useState(null); + const router = useRouter(); + const params = useParams(); + const searchSpaceId = params.search_space_id as string; - useEffect(() => { - if (error) { - toast.error("Failed to load connectors"); - console.error("Error fetching connectors:", error); - } - }, [error]); + const { connectors, isLoading, error, deleteConnector, indexConnector } = + useSearchSourceConnectors(); + const [connectorToDelete, setConnectorToDelete] = useState( + null, + ); + const [indexingConnectorId, setIndexingConnectorId] = useState( + null, + ); - // Handle connector deletion - const handleDeleteConnector = async () => { - if (connectorToDelete === null) return; - - try { - await deleteConnector(connectorToDelete); - toast.success("Connector deleted successfully"); - } catch (error) { - console.error("Error deleting connector:", error); - toast.error("Failed to delete connector"); - } finally { - setConnectorToDelete(null); - } - }; + useEffect(() => { + if (error) { + toast.error("Failed to load connectors"); + console.error("Error fetching connectors:", error); + } + }, [error]); - // Handle connector indexing - const handleIndexConnector = async (connectorId: number) => { - setIndexingConnectorId(connectorId); - try { - await indexConnector(connectorId, searchSpaceId); - toast.success("Connector content indexed successfully"); - } catch (error) { - console.error("Error indexing connector content:", error); - toast.error(error instanceof Error ? error.message : "Failed to index connector content"); - } finally { - setIndexingConnectorId(null); - } - }; + // Handle connector deletion + const handleDeleteConnector = async () => { + if (connectorToDelete === null) return; - return ( -
- -
-

Connectors

-

- Manage your connected services and data sources. -

-
- -
+ try { + await deleteConnector(connectorToDelete); + toast.success("Connector deleted successfully"); + } catch (error) { + console.error("Error deleting connector:", error); + toast.error("Failed to delete connector"); + } finally { + setConnectorToDelete(null); + } + }; - - - Your Connectors - - View and manage all your connected services. - - - - {isLoading ? ( -
-
-
-
-
-
- ) : connectors.length === 0 ? ( -
-

No connectors found

-

- You haven't added any connectors yet. Add one to enhance your search capabilities. -

- -
- ) : ( -
- - - - Name - Type - Last Indexed - Actions - - - - {connectors.map((connector) => ( - - {connector.name} - {getConnectorTypeDisplay(connector.connector_type)} - - {connector.is_indexable - ? formatDateTime(connector.last_indexed_at) - : "Not indexable"} - - -
- {connector.is_indexable && ( - - - - - - -

Index Content

-
-
-
- )} - - - - - - - - Delete Connector - - Are you sure you want to delete this connector? This action cannot be undone. - - - - setConnectorToDelete(null)}> - Cancel - - - Delete - - - - -
-
-
- ))} -
-
-
- )} -
-
-
- ); -} + // Handle connector indexing + const handleIndexConnector = async (connectorId: number) => { + setIndexingConnectorId(connectorId); + try { + await indexConnector(connectorId, searchSpaceId); + toast.success("Connector content indexed successfully"); + } catch (error) { + console.error("Error indexing connector content:", error); + toast.error( + error instanceof Error + ? error.message + : "Failed to index connector content", + ); + } finally { + setIndexingConnectorId(null); + } + }; + + return ( +
+ +
+

Connectors

+

+ Manage your connected services and data sources. +

+
+ +
+ + + + Your Connectors + + View and manage all your connected services. + + + + {isLoading ? ( +
+
+
+
+
+
+ ) : connectors.length === 0 ? ( +
+

No connectors found

+

+ You haven't added any connectors yet. Add one to enhance your + search capabilities. +

+ +
+ ) : ( +
+ + + + Name + Type + Last Indexed + Actions + + + + {connectors.map((connector) => ( + + + {connector.name} + + + {getConnectorIcon(connector.connector_type)} + + + {connector.is_indexable + ? formatDateTime(connector.last_indexed_at) + : "Not indexable"} + + +
+ {connector.is_indexable && ( + + + + + + +

Index Content

+
+
+
+ )} + + + + + + + + + Delete Connector + + + Are you sure you want to delete this + connector? This action cannot be undone. + + + + setConnectorToDelete(null)} + > + Cancel + + + Delete + + + + +
+
+
+ ))} +
+
+
+ )} +
+
+
+ ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx index d41295faa..644dbc981 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx @@ -1,6 +1,6 @@ "use client"; -import React, { useEffect } from 'react'; +import React, { useEffect } from "react"; import { useRouter, useParams } from "next/navigation"; import { motion } from "framer-motion"; import { toast } from "sonner"; @@ -8,169 +8,208 @@ import { ArrowLeft, Check, Loader2, Github } from "lucide-react"; import { Form } from "@/components/ui/form"; import { Button } from "@/components/ui/button"; -import { Card, CardContent, CardDescription, CardFooter, CardHeader, CardTitle } from "@/components/ui/card"; +import { + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, +} from "@/components/ui/card"; // Import Utils, Types, Hook, and Components -import { getConnectorTypeDisplay } from '@/lib/connectors/utils'; -import { useConnectorEditPage } from '@/hooks/useConnectorEditPage'; +import { getConnectorTypeDisplay } from "@/lib/connectors/utils"; +import { useConnectorEditPage } from "@/hooks/useConnectorEditPage"; import { EditConnectorLoadingSkeleton } from "@/components/editConnector/EditConnectorLoadingSkeleton"; import { EditConnectorNameForm } from "@/components/editConnector/EditConnectorNameForm"; import { EditGitHubConnectorConfig } from "@/components/editConnector/EditGitHubConnectorConfig"; import { EditSimpleTokenForm } from "@/components/editConnector/EditSimpleTokenForm"; +import { getConnectorIcon } from "@/components/chat"; export default function EditConnectorPage() { - const router = useRouter(); - const params = useParams(); - const searchSpaceId = params.search_space_id as string; - // Ensure connectorId is parsed safely - const connectorIdParam = params.connector_id as string; - const connectorId = connectorIdParam ? parseInt(connectorIdParam, 10) : NaN; + const router = useRouter(); + const params = useParams(); + const searchSpaceId = params.search_space_id as string; + // Ensure connectorId is parsed safely + const connectorIdParam = params.connector_id as string; + const connectorId = connectorIdParam ? 
parseInt(connectorIdParam, 10) : NaN; - // Use the custom hook to manage state and logic - const { - connectorsLoading, - connector, - isSaving, - editForm, - patForm, // Needed for GitHub child component - handleSaveChanges, - // GitHub specific props for the child component - editMode, - setEditMode, // Pass down if needed by GitHub component - originalPat, - currentSelectedRepos, - fetchedRepos, - setFetchedRepos, - newSelectedRepos, - setNewSelectedRepos, - isFetchingRepos, - handleFetchRepositories, - handleRepoSelectionChange, - } = useConnectorEditPage(connectorId, searchSpaceId); + // Use the custom hook to manage state and logic + const { + connectorsLoading, + connector, + isSaving, + editForm, + patForm, // Needed for GitHub child component + handleSaveChanges, + // GitHub specific props for the child component + editMode, + setEditMode, // Pass down if needed by GitHub component + originalPat, + currentSelectedRepos, + fetchedRepos, + setFetchedRepos, + newSelectedRepos, + setNewSelectedRepos, + isFetchingRepos, + handleFetchRepositories, + handleRepoSelectionChange, + } = useConnectorEditPage(connectorId, searchSpaceId); - // Redirect if connectorId is not a valid number after parsing - useEffect(() => { - if (isNaN(connectorId)) { - toast.error("Invalid Connector ID."); - router.push(`/dashboard/${searchSpaceId}/connectors`); - } - }, [connectorId, router, searchSpaceId]); + // Redirect if connectorId is not a valid number after parsing + useEffect(() => { + if (isNaN(connectorId)) { + toast.error("Invalid Connector ID."); + router.push(`/dashboard/${searchSpaceId}/connectors`); + } + }, [connectorId, router, searchSpaceId]); - // Loading State - if (connectorsLoading || !connector) { - // Handle NaN case before showing skeleton - if (isNaN(connectorId)) return null; - return ; - } + // Loading State + if (connectorsLoading || !connector) { + // Handle NaN case before showing skeleton + if (isNaN(connectorId)) return null; + return ; + } - // Main Render using data/handlers from the hook - return ( -
- + // Main Render using data/handlers from the hook + return ( +
+ - - - - - {/* TODO: Dynamic icon */} - Edit {getConnectorTypeDisplay(connector.connector_type)} Connector - - Modify connector name and configuration. - + + + + + {getConnectorIcon(connector.connector_type)} + Edit {getConnectorTypeDisplay(connector.connector_type)} Connector + + + Modify connector name and configuration. + + -
- {/* Pass hook's handleSaveChanges */} - - - {/* Pass form control from hook */} - + + {/* Pass hook's handleSaveChanges */} + + + {/* Pass form control from hook */} + -
+
-

Configuration

+

Configuration

- {/* == GitHub == */} - {connector.connector_type === 'GITHUB_CONNECTOR' && ( - - )} + {/* == GitHub == */} + {connector.connector_type === "GITHUB_CONNECTOR" && ( + + )} - {/* == Slack == */} - {connector.connector_type === 'SLACK_CONNECTOR' && ( - - )} - {/* == Notion == */} - {connector.connector_type === 'NOTION_CONNECTOR' && ( - - )} - {/* == Serper == */} - {connector.connector_type === 'SERPER_API' && ( - - )} - {/* == Tavily == */} - {connector.connector_type === 'TAVILY_API' && ( - - )} + {/* == Slack == */} + {connector.connector_type === "SLACK_CONNECTOR" && ( + + )} + {/* == Notion == */} + {connector.connector_type === "NOTION_CONNECTOR" && ( + + )} + {/* == Serper == */} + {connector.connector_type === "SERPER_API" && ( + + )} + {/* == Tavily == */} + {connector.connector_type === "TAVILY_API" && ( + + )} - {/* == Linear == */} - {connector.connector_type === 'LINEAR_CONNECTOR' && ( - - )} + {/* == Linear == */} + {connector.connector_type === "LINEAR_CONNECTOR" && ( + + )} -
- - - - - -
-
-
- ); -} + {/* == Linkup == */} + {connector.connector_type === "LINKUP_API" && ( + + )} + + + + + + + + +
+ ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx index ad6ceb7bf..c2726a837 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/page.tsx @@ -52,6 +52,7 @@ const getConnectorTypeDisplay = (type: string): string => { "SLACK_CONNECTOR": "Slack Connector", "NOTION_CONNECTOR": "Notion Connector", "GITHUB_CONNECTOR": "GitHub Connector", + "LINKUP_API": "Linkup", // Add other connector types here as needed }; return typeMap[type] || type; @@ -87,7 +88,8 @@ export default function EditConnectorPage() { "TAVILY_API": "TAVILY_API_KEY", "SLACK_CONNECTOR": "SLACK_BOT_TOKEN", "NOTION_CONNECTOR": "NOTION_INTEGRATION_TOKEN", - "GITHUB_CONNECTOR": "GITHUB_PAT" + "GITHUB_CONNECTOR": "GITHUB_PAT", + "LINKUP_API": "LINKUP_API_KEY" }; return fieldMap[connectorType] || ""; }; @@ -229,7 +231,9 @@ export default function EditConnectorPage() { ? "Notion Integration Token" : connector?.connector_type === "GITHUB_CONNECTOR" ? "GitHub Personal Access Token (PAT)" - : "API Key"} + : connector?.connector_type === "LINKUP_API" + ? "Linkup API Key" + : "API Key"} @@ -253,7 +259,9 @@ export default function EditConnectorPage() { ? "Enter a new Notion Integration Token or leave blank to keep your existing token." : connector?.connector_type === "GITHUB_CONNECTOR" ? "Enter a new GitHub PAT or leave blank to keep your existing token." - : "Enter a new API key or leave blank to keep your existing key."} + : connector?.connector_type === "LINKUP_API" + ? "Enter a new Linkup API Key or leave blank to keep your existing key." + : "Enter a new API key or leave blank to keep your existing key."} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/linkup-api/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/linkup-api/page.tsx new file mode 100644 index 000000000..291bdfb36 --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/linkup-api/page.tsx @@ -0,0 +1,207 @@ +"use client"; + +import { useState } from "react"; +import { useRouter, useParams } from "next/navigation"; +import { motion } from "framer-motion"; +import { zodResolver } from "@hookform/resolvers/zod"; +import { useForm } from "react-hook-form"; +import * as z from "zod"; +import { toast } from "sonner"; +import { ArrowLeft, Check, Info, Loader2 } from "lucide-react"; + +import { useSearchSourceConnectors } from "@/hooks/useSearchSourceConnectors"; +import { + Form, + FormControl, + FormDescription, + FormField, + FormItem, + FormLabel, + FormMessage, +} from "@/components/ui/form"; +import { Input } from "@/components/ui/input"; +import { Button } from "@/components/ui/button"; +import { + Card, + CardContent, + CardDescription, + CardFooter, + CardHeader, + CardTitle, +} from "@/components/ui/card"; +import { + Alert, + AlertDescription, + AlertTitle, +} from "@/components/ui/alert"; + +// Define the form schema with Zod +const linkupApiFormSchema = z.object({ + name: z.string().min(3, { + message: "Connector name must be at least 3 characters.", + }), + api_key: z.string().min(10, { + message: "API key is required and must be valid.", + }), +}); + +// Define the type for the form values +type LinkupApiFormValues = z.infer; + +export default function LinkupApiPage() { + const router = useRouter(); + const params = useParams(); + const 
searchSpaceId = params.search_space_id as string; + const [isSubmitting, setIsSubmitting] = useState(false); + const { createConnector } = useSearchSourceConnectors(); + + // Initialize the form + const form = useForm({ + resolver: zodResolver(linkupApiFormSchema), + defaultValues: { + name: "Linkup API Connector", + api_key: "", + }, + }); + + // Handle form submission + const onSubmit = async (values: LinkupApiFormValues) => { + setIsSubmitting(true); + try { + await createConnector({ + name: values.name, + connector_type: "LINKUP_API", + config: { + LINKUP_API_KEY: values.api_key, + }, + is_indexable: false, + last_indexed_at: null, + }); + + toast.success("Linkup API connector created successfully!"); + + // Navigate back to connectors page + router.push(`/dashboard/${searchSpaceId}/connectors`); + } catch (error) { + console.error("Error creating connector:", error); + toast.error(error instanceof Error ? error.message : "Failed to create connector"); + } finally { + setIsSubmitting(false); + } + }; + + return ( +
+ + + + + + Connect Linkup API + + Integrate with Linkup API to enhance your search capabilities with AI-powered search results. + + + + + + API Key Required + + You'll need a Linkup API key to use this connector. You can get one by signing up at{" "} + + linkup.so + + + + +
+ + ( + + Connector Name + + + + + A friendly name to identify this connector. + + + + )} + /> + + ( + + Linkup API Key + + + + + Your API key will be encrypted and stored securely. + + + + )} + /> + +
+ +
+ + +
+ +

What you get with Linkup API:

+
    +
  • AI-powered search results tailored to your queries
  • Real-time information from the web
  • Enhanced search capabilities for your projects
+
+
+
+
+ ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx index 1f7490270..c04dae645 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx @@ -16,6 +16,7 @@ import { IconWorldWww, IconTicket, IconLayoutKanban, + IconLinkPlus, } from "@tabler/icons-react"; import { AnimatePresence, motion } from "framer-motion"; import Link from "next/link"; @@ -50,7 +51,13 @@ const connectorCategories: ConnectorCategory[] = [ icon: , status: "available", }, - // Add other search engine connectors like Tavily, Serper if they have UI config + { + id: "linkup-api", + title: "Linkup API", + description: "Search the web using the Linkup API", + icon: , + status: "available", + }, ], }, { diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx index a6fb3d128..b8848b05e 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx @@ -42,34 +42,95 @@ export default function FileUploader() { const router = useRouter(); const fileInputRef = useRef(null); - const acceptedFileTypes = { - 'image/bmp': ['.bmp'], - 'text/csv': ['.csv'], - 'application/msword': ['.doc'], - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], - 'message/rfc822': ['.eml'], - 'application/epub+zip': ['.epub'], - 'image/heic': ['.heic'], - 'text/html': ['.html'], - 'image/jpeg': ['.jpeg', '.jpg'], - 'image/png': ['.png'], - 'text/markdown': ['.md'], - 'application/vnd.ms-outlook': ['.msg'], - 'application/vnd.oasis.opendocument.text': ['.odt'], - 'text/x-org': ['.org'], - 'application/pkcs7-signature': ['.p7s'], - 'application/pdf': ['.pdf'], - 'application/vnd.ms-powerpoint': ['.ppt'], - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], - 'text/x-rst': ['.rst'], - 'application/rtf': ['.rtf'], - 'image/tiff': ['.tiff'], - 'text/plain': ['.txt'], - 'text/tab-separated-values': ['.tsv'], - 'application/vnd.ms-excel': ['.xls'], - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], - 'application/xml': ['.xml'], - } + // Audio files are always supported (using whisper) + const audioFileTypes = { + 'audio/mpeg': ['.mp3', '.mpeg', '.mpga'], + 'audio/mp4': ['.mp4', '.m4a'], + 'audio/wav': ['.wav'], + 'audio/webm': ['.webm'], + }; + + // Conditionally set accepted file types based on ETL service + const acceptedFileTypes = process.env.NEXT_PUBLIC_ETL_SERVICE === 'LLAMACLOUD' + ? 
{ + // LlamaCloud supported file types + 'application/pdf': ['.pdf'], + 'application/msword': ['.doc'], + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'application/vnd.ms-word.document.macroEnabled.12': ['.docm'], + 'application/msword-template': ['.dot'], + 'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'], + 'application/vnd.ms-powerpoint': ['.ppt'], + 'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'], + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], + 'application/vnd.ms-powerpoint.template': ['.pot'], + 'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'], + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], + 'application/vnd.ms-excel': ['.xls'], + 'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'], + 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'], + 'application/vnd.ms-excel.workspace': ['.xlw'], + 'application/rtf': ['.rtf'], + 'application/xml': ['.xml'], + 'application/epub+zip': ['.epub'], + 'application/vnd.apple.keynote': ['.key'], + 'application/vnd.apple.pages': ['.pages'], + 'application/vnd.apple.numbers': ['.numbers'], + 'application/vnd.wordperfect': ['.wpd'], + 'application/vnd.oasis.opendocument.text': ['.odt'], + 'application/vnd.oasis.opendocument.presentation': ['.odp'], + 'application/vnd.oasis.opendocument.graphics': ['.odg'], + 'application/vnd.oasis.opendocument.spreadsheet': ['.ods'], + 'application/vnd.oasis.opendocument.formula': ['.fods'], + 'text/plain': ['.txt'], + 'text/csv': ['.csv'], + 'text/tab-separated-values': ['.tsv'], + 'text/html': ['.html', '.htm', '.web'], + 'image/jpeg': ['.jpg', '.jpeg'], + 'image/png': ['.png'], + 'image/gif': ['.gif'], + 'image/bmp': ['.bmp'], + 'image/svg+xml': ['.svg'], + 'image/tiff': ['.tiff'], + 'image/webp': ['.webp'], + 'application/dbase': ['.dbf'], + 'application/vnd.lotus-1-2-3': ['.123'], + 'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'], + 'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'], + // Audio files (always supported) + ...audioFileTypes, + } + : { + // Unstructured supported file types + 'image/bmp': ['.bmp'], + 'text/csv': ['.csv'], + 'application/msword': ['.doc'], + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'message/rfc822': ['.eml'], + 'application/epub+zip': ['.epub'], + 'image/heic': ['.heic'], + 'text/html': ['.html'], + 'image/jpeg': ['.jpeg', '.jpg'], + 'image/png': ['.png'], + 'text/markdown': ['.md', '.markdown'], + 'application/vnd.ms-outlook': ['.msg'], + 'application/vnd.oasis.opendocument.text': ['.odt'], + 'text/x-org': ['.org'], + 'application/pkcs7-signature': ['.p7s'], + 'application/pdf': ['.pdf'], + 'application/vnd.ms-powerpoint': ['.ppt'], + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], + 'text/x-rst': ['.rst'], + 'application/rtf': ['.rtf'], + 'image/tiff': ['.tiff'], + 'text/plain': ['.txt'], + 'text/tab-separated-values': ['.tsv'], + 'application/vnd.ms-excel': ['.xls'], + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], + 'application/xml': ['.xml'], + // 
Audio files (always supported) + ...audioFileTypes, + }; const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort() diff --git a/surfsense_web/app/dashboard/[search_space_id]/layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/layout.tsx index 7449e10b5..a3c344aaf 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/layout.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/layout.tsx @@ -73,6 +73,13 @@ export default function DashboardLayout({ }, ], }, + { + title: "Podcasts", + url: `/dashboard/${search_space_id}/podcasts`, + icon: "Podcast", + items: [ + ], + } // TODO: Add research synthesizer's // { // title: "Research Synthesizer's", diff --git a/surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx new file mode 100644 index 000000000..429260724 --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx @@ -0,0 +1,20 @@ +import { Suspense } from 'react'; +import PodcastsPageClient from './podcasts-client'; + +interface PageProps { + params: { + search_space_id: string; + }; +} + +export default async function PodcastsPage({ params }: PageProps) { + const { search_space_id: searchSpaceId } = await Promise.resolve(params); + + return ( + +
+
}> + + + ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx b/surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx new file mode 100644 index 000000000..5489d8672 --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx @@ -0,0 +1,968 @@ +'use client'; + +import { format } from 'date-fns'; +import { AnimatePresence, motion } from 'framer-motion'; +import { + Calendar, + MoreHorizontal, + Pause, + Play, + Podcast, + Search, + SkipBack, + SkipForward, + Trash2, + Volume2, VolumeX +} from 'lucide-react'; +import { useEffect, useRef, useState } from 'react'; + +// UI Components +import { Button } from '@/components/ui/button'; +import { Card } from '@/components/ui/card'; +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog"; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuTrigger +} from '@/components/ui/dropdown-menu'; +import { Input } from '@/components/ui/input'; +import { + Select, + SelectContent, + SelectGroup, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { Slider } from '@/components/ui/slider'; +import { toast } from "sonner"; + +interface PodcastItem { + id: number; + title: string; + created_at: string; + file_location: string; + podcast_transcript: any[]; + search_space_id: number; +} + +interface PodcastsPageClientProps { + searchSpaceId: string; +} + +const pageVariants = { + initial: { opacity: 0 }, + enter: { opacity: 1, transition: { duration: 0.4, ease: 'easeInOut', staggerChildren: 0.1 } }, + exit: { opacity: 0, transition: { duration: 0.3, ease: 'easeInOut' } } +}; + +const podcastCardVariants = { + initial: { scale: 0.95, y: 20, opacity: 0 }, + animate: { scale: 1, y: 0, opacity: 1, transition: { type: "spring", stiffness: 300, damping: 25 } }, + exit: { scale: 0.95, y: -20, opacity: 0 }, + hover: { y: -5, scale: 1.02, transition: { duration: 0.2 } } +}; + +const MotionCard = motion(Card); + +export default function PodcastsPageClient({ searchSpaceId }: PodcastsPageClientProps) { + const [podcasts, setPodcasts] = useState([]); + const [filteredPodcasts, setFilteredPodcasts] = useState([]); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + const [searchQuery, setSearchQuery] = useState(''); + const [sortOrder, setSortOrder] = useState('newest'); + const [deleteDialogOpen, setDeleteDialogOpen] = useState(false); + const [podcastToDelete, setPodcastToDelete] = useState<{ id: number, title: string } | null>(null); + const [isDeleting, setIsDeleting] = useState(false); + + // Audio player state + const [currentPodcast, setCurrentPodcast] = useState(null); + const [audioSrc, setAudioSrc] = useState(undefined); + const [isAudioLoading, setIsAudioLoading] = useState(false); + const [isPlaying, setIsPlaying] = useState(false); + const [currentTime, setCurrentTime] = useState(0); + const [duration, setDuration] = useState(0); + const [volume, setVolume] = useState(0.7); + const [isMuted, setIsMuted] = useState(false); + const audioRef = useRef(null); + const currentObjectUrlRef = useRef(null); + + // Add podcast image URL constant + const PODCAST_IMAGE_URL = "https://static.vecteezy.com/system/resources/thumbnails/002/157/611/small_2x/illustrations-concept-design-podcast-channel-free-vector.jpg"; + + // Fetch podcasts from API + useEffect(() => { + const fetchPodcasts = 
async () => { + try { + setIsLoading(true); + + // Get token from localStorage + const token = localStorage.getItem('surfsense_bearer_token'); + + if (!token) { + setError('Authentication token not found. Please log in again.'); + setIsLoading(false); + return; + } + + // Fetch all podcasts for this search space + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/`, + { + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json', + }, + cache: 'no-store', + } + ); + + if (!response.ok) { + const errorData = await response.json().catch(() => null); + throw new Error(`Failed to fetch podcasts: ${response.status} ${errorData?.detail || ''}`); + } + + const data: PodcastItem[] = await response.json(); + setPodcasts(data); + setFilteredPodcasts(data); + setError(null); + } catch (error) { + console.error('Error fetching podcasts:', error); + setError(error instanceof Error ? error.message : 'Unknown error occurred'); + setPodcasts([]); + setFilteredPodcasts([]); + } finally { + setIsLoading(false); + } + }; + + fetchPodcasts(); + }, [searchSpaceId]); + + // Filter and sort podcasts based on search query and sort order + useEffect(() => { + let result = [...podcasts]; + + // Filter by search term + if (searchQuery) { + const query = searchQuery.toLowerCase(); + result = result.filter(podcast => + podcast.title.toLowerCase().includes(query) + ); + } + + // Filter by search space + result = result.filter(podcast => + podcast.search_space_id === parseInt(searchSpaceId) + ); + + // Sort podcasts + result.sort((a, b) => { + const dateA = new Date(a.created_at).getTime(); + const dateB = new Date(b.created_at).getTime(); + + return sortOrder === 'newest' ? dateB - dateA : dateA - dateB; + }); + + setFilteredPodcasts(result); + }, [podcasts, searchQuery, sortOrder, searchSpaceId]); + + // Cleanup object URL on unmount or when currentPodcast changes + useEffect(() => { + return () => { + if (currentObjectUrlRef.current) { + URL.revokeObjectURL(currentObjectUrlRef.current); + currentObjectUrlRef.current = null; + } + }; + }, []); + + // Audio player time update handler + const handleTimeUpdate = () => { + if (audioRef.current) { + setCurrentTime(audioRef.current.currentTime); + } + }; + + // Audio player metadata loaded handler + const handleMetadataLoaded = () => { + if (audioRef.current) { + setDuration(audioRef.current.duration); + } + }; + + // Play/pause toggle + const togglePlayPause = () => { + if (audioRef.current) { + if (isPlaying) { + audioRef.current.pause(); + } else { + audioRef.current.play(); + } + setIsPlaying(!isPlaying); + } + }; + + // Seek to position + const handleSeek = (value: number[]) => { + if (audioRef.current) { + audioRef.current.currentTime = value[0]; + setCurrentTime(value[0]); + } + }; + + // Volume change + const handleVolumeChange = (value: number[]) => { + if (audioRef.current) { + const newVolume = value[0]; + + // Set volume + audioRef.current.volume = newVolume; + setVolume(newVolume); + + // Handle mute state based on volume + if (newVolume === 0) { + audioRef.current.muted = true; + setIsMuted(true); + } else { + audioRef.current.muted = false; + setIsMuted(false); + } + } + }; + + // Toggle mute + const toggleMute = () => { + if (audioRef.current) { + const newMutedState = !isMuted; + audioRef.current.muted = newMutedState; + setIsMuted(newMutedState); + + // If unmuting, restore previous volume if it was 0 + if (!newMutedState && volume === 0) { + const restoredVolume = 0.5; + 
audioRef.current.volume = restoredVolume; + setVolume(restoredVolume); + } + } + }; + + // Skip forward 10 seconds + const skipForward = () => { + if (audioRef.current) { + audioRef.current.currentTime = Math.min(audioRef.current.duration, audioRef.current.currentTime + 10); + } + }; + + // Skip backward 10 seconds + const skipBackward = () => { + if (audioRef.current) { + audioRef.current.currentTime = Math.max(0, audioRef.current.currentTime - 10); + } + }; + + // Format time in MM:SS + const formatTime = (time: number) => { + const minutes = Math.floor(time / 60); + const seconds = Math.floor(time % 60); + return `${minutes}:${seconds < 10 ? '0' : ''}${seconds}`; + }; + + // Play podcast - Fetch blob and set object URL + const playPodcast = async (podcast: PodcastItem) => { + // If the same podcast is selected, just toggle play/pause + if (currentPodcast && currentPodcast.id === podcast.id) { + togglePlayPause(); + return; + } + + // Prevent multiple simultaneous loading requests + if (isAudioLoading) { + return; + } + + try { + // Reset player state and show loading + setCurrentPodcast(podcast); + setAudioSrc(undefined); + setCurrentTime(0); + setDuration(0); + setIsPlaying(false); + setIsAudioLoading(true); + + const token = localStorage.getItem('surfsense_bearer_token'); + if (!token) { + throw new Error('Authentication token not found.'); + } + + // Revoke previous object URL if exists (only after we've started the new request) + if (currentObjectUrlRef.current) { + URL.revokeObjectURL(currentObjectUrlRef.current); + currentObjectUrlRef.current = null; + } + + // Use AbortController to handle timeout or cancellation + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), 30000); // 30 second timeout + + try { + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/${podcast.id}/stream`, + { + headers: { + 'Authorization': `Bearer ${token}`, + }, + signal: controller.signal + } + ); + + if (!response.ok) { + throw new Error(`Failed to fetch audio stream: ${response.statusText}`); + } + + const blob = await response.blob(); + const objectUrl = URL.createObjectURL(blob); + currentObjectUrlRef.current = objectUrl; + + // Set audio source + setAudioSrc(objectUrl); + + // Wait for the audio to be ready before playing + // We'll handle actual playback in the onLoadedData event instead of here + } catch (error) { + if (error instanceof DOMException && error.name === 'AbortError') { + throw new Error('Request timed out. Please try again.'); + } + throw error; + } finally { + clearTimeout(timeoutId); + } + } catch (error) { + console.error('Error fetching or playing podcast:', error); + toast.error(error instanceof Error ? 
error.message : 'Failed to load podcast audio.'); + // Reset state on error + setCurrentPodcast(null); + setAudioSrc(undefined); + } finally { + setIsAudioLoading(false); + } + }; + + // Function to handle podcast deletion + const handleDeletePodcast = async () => { + if (!podcastToDelete) return; + + setIsDeleting(true); + try { + const token = localStorage.getItem('surfsense_bearer_token'); + if (!token) { + setIsDeleting(false); + return; + } + + const response = await fetch(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/${podcastToDelete.id}`, { + method: 'DELETE', + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json', + } + }); + + if (!response.ok) { + throw new Error(`Failed to delete podcast: ${response.statusText}`); + } + + // Close dialog and refresh podcasts + setDeleteDialogOpen(false); + setPodcastToDelete(null); + + // Update local state by removing the deleted podcast + setPodcasts(prevPodcasts => prevPodcasts.filter(podcast => podcast.id !== podcastToDelete.id)); + + // If the current playing podcast is deleted, stop playback + if (currentPodcast && currentPodcast.id === podcastToDelete.id) { + if (audioRef.current) { + audioRef.current.pause(); + } + setCurrentPodcast(null); + setIsPlaying(false); + } + + toast.success('Podcast deleted successfully'); + } catch (error) { + console.error('Error deleting podcast:', error); + toast.error(error instanceof Error ? error.message : 'Failed to delete podcast'); + } finally { + setIsDeleting(false); + } + }; + + return ( + +
+
+

Podcasts

+

Listen to generated podcasts.

+
+ + {/* Filter and Search Bar */} +
+
+
+ + setSearchQuery(e.target.value)} + /> +
+
+ +
+ +
+
+ + {/* Status Messages */} + {isLoading && ( +
+
+
+

Loading podcasts...

+
+
+ )} + + {error && !isLoading && ( +
+

Error loading podcasts

+

{error}

+
+ )} + + {!isLoading && !error && filteredPodcasts.length === 0 && ( +
+ +

No podcasts found

+

+ {searchQuery + ? 'Try adjusting your search filters' + : 'Generate podcasts from your chats to get started'} +

+
+ )} + + {/* Podcast Grid */} + {!isLoading && !error && filteredPodcasts.length > 0 && ( + + + {filteredPodcasts.map((podcast, index) => ( + playPodcast(podcast)} + > +
+ {/* Podcast image with gradient overlay */} + Podcast illustration + + {/* Better overlay with gradient for improved text legibility */} +
+ + {/* Loading indicator with improved animation */} + {currentPodcast?.id === podcast.id && isAudioLoading && ( + + +
+

Loading podcast...

+
+
+ )} + + {/* Play button with animations */} + {!(currentPodcast?.id === podcast.id && (isPlaying || isAudioLoading)) && ( + + + + )} + + {/* Pause button with animations */} + {currentPodcast?.id === podcast.id && isPlaying && !isAudioLoading && ( + + + + )} + + {/* Now playing indicator */} + {currentPodcast?.id === podcast.id && !isAudioLoading && ( +
+ + + + + Now Playing +
+ )} +
+ +
+

+ {podcast.title || 'Untitled Podcast'} +

+

+ + {format(new Date(podcast.created_at), 'MMM d, yyyy')} +

+
+ + {currentPodcast?.id === podcast.id && !isAudioLoading && ( + +
{ + e.stopPropagation(); + if (!audioRef.current || !duration) return; + const container = e.currentTarget; + const rect = container.getBoundingClientRect(); + const x = e.clientX - rect.left; + const percentage = Math.max(0, Math.min(1, x / rect.width)); + const newTime = percentage * duration; + handleSeek([newTime]); + }} + > + + + +
+
+ {formatTime(currentTime)} + {formatTime(duration)} +
+
+ )} + + {currentPodcast?.id === podcast.id && !isAudioLoading && ( + + + + + + + + + + + + )} + +
+ + + + + + { + e.stopPropagation(); + setPodcastToDelete({ id: podcast.id, title: podcast.title }); + setDeleteDialogOpen(true); + }} + > + + Delete Podcast + + + +
+ +
+ ))} +
+
+ )} + + {/* Current Podcast Player (Fixed at bottom) */} + {currentPodcast && !isAudioLoading && audioSrc && ( + +
+
+
+ + + +
+ +
+

{currentPodcast.title}

+ +
+
+ + +
+
+ {formatTime(currentTime)} / {formatTime(duration)} +
+
+
+ +
+ + + + + + + + + + + + +
+ + + + +
+ + +
+
+
+
+
+
+ )} +
+ + {/* Delete Confirmation Dialog */} + + + + + + Delete Podcast + + + Are you sure you want to delete {podcastToDelete?.title}? This action cannot be undone. + + + + + + + + + + {/* Hidden audio element for playback */} +
+ ); +} \ No newline at end of file diff --git a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx index d371e9e53..80f98c84d 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/researcher/[chat_id]/page.tsx @@ -13,7 +13,9 @@ import { ArrowDown, CircleUser, Database, - SendHorizontal + SendHorizontal, + FileText, + Grid3x3 } from 'lucide-react'; import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; import { Button } from '@/components/ui/button'; @@ -46,7 +48,6 @@ import { researcherOptions } from '@/components/chat'; import { MarkdownViewer } from '@/components/markdown-viewer'; -import { connectorSourcesMenu as defaultConnectorSourcesMenu } from '@/components/chat/connector-sources'; import { Logo } from '@/components/Logo'; import { useSearchSourceConnectors } from '@/hooks'; @@ -239,7 +240,6 @@ const SourcesDialogContent = ({ const ChatPage = () => { const [token, setToken] = React.useState(null); - const [activeTab, setActiveTab] = useState(""); const [dialogOpenId, setDialogOpenId] = useState(null); const [sourcesPage, setSourcesPage] = useState(1); const [expandedSources, setExpandedSources] = useState(false); @@ -249,10 +249,10 @@ const ChatPage = () => { const tabsListRef = useRef(null); const [terminalExpanded, setTerminalExpanded] = useState(false); const [selectedConnectors, setSelectedConnectors] = useState(["CRAWLED_URL"]); + const [searchMode, setSearchMode] = useState<'DOCUMENTS' | 'CHUNKS'>('DOCUMENTS'); const [researchMode, setResearchMode] = useState("GENERAL"); const [currentTime, setCurrentTime] = useState(''); const [currentDate, setCurrentDate] = useState(''); - const [connectorSources, setConnectorSources] = useState([]); const terminalMessagesRef = useRef(null); const { connectorSourceItems, isLoading: isLoadingConnectors } = useSearchSourceConnectors(); @@ -364,7 +364,8 @@ const ChatPage = () => { data: { search_space_id: search_space_id, selected_connectors: selectedConnectors, - research_mode: researchMode + research_mode: researchMode, + search_mode: searchMode } }, onError: (error) => { @@ -476,43 +477,10 @@ const ChatPage = () => { updateChat(); }, [messages, status, chat_id, researchMode, selectedConnectors, search_space_id]); - // Memoize connector sources to prevent excessive re-renders - const processedConnectorSources = React.useMemo(() => { - if (messages.length === 0) return connectorSources; - - // Only process when we have a complete message (not streaming) - if (status !== 'ready') return connectorSources; - - // Find the latest assistant message - const assistantMessages = messages.filter(msg => msg.role === 'assistant'); - if (assistantMessages.length === 0) return connectorSources; - - const latestAssistantMessage = assistantMessages[assistantMessages.length - 1]; - if (!latestAssistantMessage?.annotations) return connectorSources; - - // Find the latest SOURCES annotation - const annotations = latestAssistantMessage.annotations as any[]; - const sourcesAnnotations = annotations.filter(a => a.type === 'SOURCES'); - - if (sourcesAnnotations.length === 0) return connectorSources; - - const latestSourcesAnnotation = sourcesAnnotations[sourcesAnnotations.length - 1]; - if (!latestSourcesAnnotation.content) return connectorSources; - - // Use this content if it differs from current - return latestSourcesAnnotation.content; - }, 
[messages, status, connectorSources]); - - // Update connector sources when processed value changes - useEffect(() => { - if (processedConnectorSources !== connectorSources) { - setConnectorSources(processedConnectorSources); - } - }, [processedConnectorSources, connectorSources]); - // Check and scroll terminal when terminal info is available useEffect(() => { - if (messages.length === 0 || status !== 'ready') return; + // Modified to trigger during streaming as well (removed status check) + if (messages.length === 0) return; // Find the latest assistant message const assistantMessages = messages.filter(msg => msg.role === 'assistant'); @@ -526,10 +494,27 @@ const ChatPage = () => { const terminalInfoAnnotations = annotations.filter(a => a.type === 'TERMINAL_INFO'); if (terminalInfoAnnotations.length > 0) { - // Schedule scrolling after the DOM has been updated - setTimeout(scrollTerminalToBottom, 100); + // Always scroll to bottom when terminal info is updated, even during streaming + scrollTerminalToBottom(); } - }, [messages, status]); + }, [messages]); // Removed status from dependencies to ensure it triggers during streaming + + // Pure function to get connector sources for a specific message + const getMessageConnectorSources = (message: any): any[] => { + if (!message || message.role !== 'assistant' || !message.annotations) return []; + + // Find all SOURCES annotations + const annotations = message.annotations as any[]; + const sourcesAnnotations = annotations.filter(a => a.type === 'SOURCES'); + + // Get the latest SOURCES annotation + if (sourcesAnnotations.length === 0) return []; + const latestSourcesAnnotation = sourcesAnnotations[sourcesAnnotations.length - 1]; + + if (!latestSourcesAnnotation.content) return []; + + return latestSourcesAnnotation.content; + }; // Custom handleSubmit function to include selected connectors and answer type const handleSubmit = (e: React.FormEvent) => { @@ -561,17 +546,12 @@ const ChatPage = () => { scrollToBottom(); }, [messages]); - // Set activeTab when connectorSources change using a memoized value - const activeTabValue = React.useMemo(() => { - return connectorSources.length > 0 ? 
connectorSources[0].type : ""; - }, [connectorSources]); - - // Update activeTab when the memoized value changes + // Reset sources page when new messages arrive useEffect(() => { - if (activeTabValue && activeTabValue !== activeTab) { - setActiveTab(activeTabValue); - } - }, [activeTabValue, activeTab]); + // Reset pagination when we get new messages + setSourcesPage(1); + setExpandedSources(false); + }, [messages]); // Scroll terminal to bottom when expanded useEffect(() => { @@ -580,11 +560,6 @@ const ChatPage = () => { } }, [terminalExpanded]); - // Get total sources count for a connector type - const getSourcesCount = (connectorType: string) => { - return getSourcesCountUtil(connectorSources, connectorType); - }; - // Function to check scroll position and update indicators const updateScrollIndicators = () => { updateScrollIndicatorsUtil(tabsListRef as React.RefObject, setCanScrollLeft, setCanScrollRight); @@ -610,23 +585,6 @@ const ChatPage = () => { // Use the scroll to bottom hook useScrollToBottom(messagesEndRef as React.RefObject, [messages]); - // Function to get sources for the main view - const getMainViewSources = (connector: any) => { - return getMainViewSourcesUtil(connector, INITIAL_SOURCES_DISPLAY); - }; - - // Function to get filtered sources for the dialog with null check - const getFilteredSourcesWithCheck = (connector: any, sourceFilter: string) => { - if (!connector?.sources) return []; - return getFilteredSourcesUtil(connector, sourceFilter); - }; - - // Function to get paginated dialog sources with null check - const getPaginatedDialogSourcesWithCheck = (connector: any, sourceFilter: string, expandedSources: boolean, sourcesPage: number, sourcesPerPage: number) => { - if (!connector?.sources) return []; - return getPaginatedDialogSourcesUtil(connector, sourceFilter, expandedSources, sourcesPage, sourcesPerPage); - }; - // Function to get a citation source by ID const getCitationSource = React.useCallback((citationId: number, messageIndex?: number): Source | null => { if (!messages || messages.length === 0) return null; @@ -638,23 +596,14 @@ const ChatPage = () => { if (assistantMessages.length === 0) return null; const latestAssistantMessage = assistantMessages[assistantMessages.length - 1]; - if (!latestAssistantMessage?.annotations) return null; - - // Find all SOURCES annotations - const annotations = latestAssistantMessage.annotations as any[]; - const sourcesAnnotations = annotations.filter( - (annotation) => annotation.type === 'SOURCES' - ); - - // Get the latest SOURCES annotation - if (sourcesAnnotations.length === 0) return null; - const latestSourcesAnnotation = sourcesAnnotations[sourcesAnnotations.length - 1]; - - if (!latestSourcesAnnotation.content) return null; + + // Use our helper function to get sources + const sources = getMessageConnectorSources(latestAssistantMessage); + if (sources.length === 0) return null; // Flatten all sources from all connectors const allSources: Source[] = []; - latestSourcesAnnotation.content.forEach((connector: ConnectorSource) => { + sources.forEach((connector: ConnectorSource) => { if (connector.sources && Array.isArray(connector.sources)) { connector.sources.forEach((source: SourceItem) => { allSources.push({ @@ -675,23 +624,14 @@ const ChatPage = () => { } else { // Use the specific message by index const message = messages[messageIndex]; - if (!message || message.role !== 'assistant' || !message.annotations) return null; - - // Find all SOURCES annotations - const annotations = message.annotations as any[]; - 
const sourcesAnnotations = annotations.filter( - (annotation) => annotation.type === 'SOURCES' - ); - - // Get the latest SOURCES annotation - if (sourcesAnnotations.length === 0) return null; - const latestSourcesAnnotation = sourcesAnnotations[sourcesAnnotations.length - 1]; - - if (!latestSourcesAnnotation.content) return null; + + // Use our helper function to get sources + const sources = getMessageConnectorSources(message); + if (sources.length === 0) return null; // Flatten all sources from all connectors const allSources: Source[] = []; - latestSourcesAnnotation.content.forEach((connector: ConnectorSource) => { + sources.forEach((connector: ConnectorSource) => { if (connector.sources && Array.isArray(connector.sources)) { connector.sources.forEach((source: SourceItem) => { allSources.push({ @@ -712,6 +652,34 @@ const ChatPage = () => { } }, [messages]); + // Pure function for rendering terminal content - no hooks allowed here + const renderTerminalContent = (message: any) => { + if (!message.annotations) return null; + + // Get all TERMINAL_INFO annotations + const terminalInfoAnnotations = (message.annotations as any[]) + .filter(a => a.type === 'TERMINAL_INFO'); + + // Get the latest TERMINAL_INFO annotation + const latestTerminalInfo = terminalInfoAnnotations.length > 0 + ? terminalInfoAnnotations[terminalInfoAnnotations.length - 1] + : null; + + // Render the content of the latest TERMINAL_INFO annotation + return latestTerminalInfo?.content.map((item: any, idx: number) => ( +
+ [{String(idx).padStart(2, '0')}:{String(Math.floor(idx * 2)).padStart(2, '0')}] + {'>'} + {item.text} +
+ )); + }; + return ( <>
@@ -781,30 +749,9 @@ const ChatPage = () => { $ surfsense-researcher
- {message.annotations && (() => { - // Get all TERMINAL_INFO annotations - const terminalInfoAnnotations = (message.annotations as any[]) - .filter(a => a.type === 'TERMINAL_INFO'); - - // Get the latest TERMINAL_INFO annotation - const latestTerminalInfo = terminalInfoAnnotations.length > 0 - ? terminalInfoAnnotations[terminalInfoAnnotations.length - 1] - : null; - - // Render the content of the latest TERMINAL_INFO annotation - return latestTerminalInfo?.content.map((item: any, idx: number) => ( -
- [{String(idx).padStart(2, '0')}:{String(Math.floor(idx * 2)).padStart(2, '0')}] - {'>'} - {item.text} -
- )); - })()} + + {renderTerminalContent(message)} +
[00:13] researcher@surfsense @@ -836,105 +783,120 @@ const ChatPage = () => { Sources
- 0 ? connectorSources[0].type : "CRAWLED_URL"} - className="w-full" - onValueChange={setActiveTab} - > -
-
- + {(() => { + // Get sources for this specific message + const messageConnectorSources = getMessageConnectorSources(message); + + if (messageConnectorSources.length === 0) { + return ( +
+ +
+ ); + } + + // Use these message-specific sources for the Tabs component + return ( + 0 ? messageConnectorSources[0].type : "CRAWLED_URL"} + className="w-full" + > +
+
+ -
-
- - {connectorSources.map((connector) => ( - - {getConnectorIcon(connector.type)} - {connector.name.split(' ')[0]} - - {getSourcesCount(connector.type)} - - - ))} - +
+
+ + {messageConnectorSources.map((connector) => ( + + {getConnectorIcon(connector.type)} + {connector.name.split(' ')[0]} + + {connector.sources?.length || 0} + + + ))} + +
+
+ +
- -
-
+ {messageConnectorSources.map(connector => ( + +
+ {connector.sources?.slice(0, INITIAL_SOURCES_DISPLAY)?.map((source: any) => ( + +
+
+ {getConnectorIcon(connector.type)} +
+
+

{source.title}

+

{source.description}

+
+ +
+
+ ))} - {connectorSources.map(connector => ( - -
- {getMainViewSources(connector)?.map((source: any) => ( - -
-
- {getConnectorIcon(connector.type)} -
-
-

{source.title}

-

{source.description}

-
- -
-
- ))} - - {connector.sources.length > INITIAL_SOURCES_DISPLAY && ( - setDialogOpenId(open ? connector.id : null)}> - - - - - - - - )} -
-
- ))} - + {connector.sources?.length > INITIAL_SOURCES_DISPLAY && ( + setDialogOpenId(open ? connector.id : null)}> + + + + + + + + )} +
+
+ ))} +
+ ); + })()}
{/* Answer Section */} @@ -1014,15 +976,17 @@ const ChatPage = () => { Send -
-
+
+
{/* Connector Selection Dialog */} - { }} - /> +
+ { }} + /> +
@@ -1089,12 +1053,40 @@ const ChatPage = () => {
+ {/* Search Mode Control */} +
+ + +
+ {/* Research Mode Segmented Control */} - - value={researchMode} - onChange={setResearchMode} - options={researcherOptions} - /> +
+ + value={researchMode} + onChange={setResearchMode} + options={researcherOptions} + /> +
diff --git a/surfsense_web/app/dashboard/page.tsx b/surfsense_web/app/dashboard/page.tsx index d3e298f83..176c9bf15 100644 --- a/surfsense_web/app/dashboard/page.tsx +++ b/surfsense_web/app/dashboard/page.tsx @@ -4,7 +4,7 @@ import React from 'react' import Link from 'next/link' import { motion } from 'framer-motion' import { Button } from '@/components/ui/button' -import { Plus, Search, Trash2, AlertCircle, Loader2 } from 'lucide-react' +import { Plus, Search, Trash2, AlertCircle, Loader2, LogOut } from 'lucide-react' import { Tilt } from '@/components/ui/tilt' import { Spotlight } from '@/components/ui/spotlight' import { Logo } from '@/components/Logo'; @@ -145,11 +145,19 @@ const DashboardPage = () => { }, }; + const router = useRouter(); const { searchSpaces, loading, error, refreshSearchSpaces } = useSearchSpaces(); if (loading) return ; if (error) return ; + const handleLogout = () => { + if (typeof window !== 'undefined') { + localStorage.removeItem('surfsense_bearer_token'); + router.push('/'); + } + }; + const handleDeleteSearchSpace = async (id: number) => { // Send DELETE request to the API try { @@ -193,7 +201,18 @@ const DashboardPage = () => {

- +
+ + +
diff --git a/surfsense_web/app/globals.css b/surfsense_web/app/globals.css index 98e4411fb..5dbd332d0 100644 --- a/surfsense_web/app/globals.css +++ b/surfsense_web/app/globals.css @@ -45,6 +45,7 @@ --sidebar-accent-foreground: oklch(0.205 0 0); --sidebar-border: oklch(0.922 0 0); --sidebar-ring: oklch(0.708 0 0); + --syntax-bg: #f5f5f5; } .dark { @@ -80,6 +81,7 @@ --sidebar-accent-foreground: oklch(0.985 0 0); --sidebar-border: oklch(0.269 0 0); --sidebar-ring: oklch(0.439 0 0); + --syntax-bg: #1e1e1e; } @theme inline { diff --git a/surfsense_web/app/layout.tsx b/surfsense_web/app/layout.tsx index 6b60891a4..33bf10bb4 100644 --- a/surfsense_web/app/layout.tsx +++ b/surfsense_web/app/layout.tsx @@ -15,35 +15,67 @@ const roboto = Roboto({ }); export const metadata: Metadata = { - title: "SurfSense - A Personal NotebookLM and Perplexity-like AI Assistant for Everyone.", - description: - "Have your own private NotebookLM and Perplexity with better integrations.", - openGraph: { - images: [ - { - url: "https://surfsense.net/og-image.png", - width: 1200, - height: 630, - alt: "SurfSense - A Personal NotebookLM and Perplexity-like AI Assistant for Everyone.", - }, - ], - }, - twitter: { - card: "summary_large_image", - site: "https://surfsense.net", - creator: "https://surfsense.net", - title: "SurfSense - A Personal NotebookLM and Perplexity-like AI Assistant for Everyone.", - description: - "Have your own private NotebookLM and Perplexity with better integrations.", - images: [ - { - url: "https://surfsense.net/og-image.png", - width: 1200, - height: 630, - alt: "SurfSense - A Personal NotebookLM and Perplexity-like AI Assistant for Everyone.", - }, - ], - }, + title: "SurfSense – Customizable AI Research & Knowledge Management Assistant", + description: + "SurfSense is an AI-powered research assistant that integrates with tools like Notion, GitHub, Slack, and more to help you efficiently manage, search, and chat with your documents. Generate podcasts, perform hybrid search, and unlock insights from your knowledge base.", + keywords: [ + "SurfSense", + "AI research assistant", + "AI knowledge management", + "AI document assistant", + "customizable AI assistant", + "notion integration", + "slack integration", + "github integration", + "hybrid search", + "vector search", + "RAG", + "LangChain", + "FastAPI", + "LLM apps", + "AI document chat", + "knowledge management AI", + "AI-powered document search", + "personal AI assistant", + "AI research tools", + "AI podcast generator", + "AI knowledge base", + "AI document assistant tools", + "AI-powered search assistant", + ], + openGraph: { + title: "SurfSense – AI Research & Knowledge Management Assistant", + description: + "Connect your documents and tools like Notion, Slack, GitHub, and more to your private AI assistant. SurfSense offers powerful search, document chat, podcast generation, and RAG APIs to enhance your workflow.", + url: "https://surfsense.net", + siteName: "SurfSense", + type: "website", + images: [ + { + url: "https://surfsense.net/og-image.png", + width: 1200, + height: 630, + alt: "SurfSense AI Research Assistant", + }, + ], + locale: "en_US", + }, + twitter: { + card: "summary_large_image", + title: "SurfSense – AI Assistant for Research & Knowledge Management", + description: + "Have your own NotebookLM or Perplexity, but better. 
SurfSense connects external tools, allows chat with your documents, and generates fast, high-quality podcasts.", + creator: "https://surfsense.net", + site: "https://surfsense.net", + images: [ + { + url: "https://surfsense.net/og-image-twitter.png", + width: 1200, + height: 630, + alt: "SurfSense AI Assistant Preview", + }, + ], + } }; export default async function RootLayout({ diff --git a/surfsense_web/app/login/AmbientBackground.tsx b/surfsense_web/app/login/AmbientBackground.tsx new file mode 100644 index 000000000..6b61d517d --- /dev/null +++ b/surfsense_web/app/login/AmbientBackground.tsx @@ -0,0 +1,43 @@ +"use client"; +import React from "react"; + +export const AmbientBackground = () => { + return ( +
+
+
+
+
+ ); +}; \ No newline at end of file diff --git a/surfsense_web/app/login/GoogleLoginButton.tsx b/surfsense_web/app/login/GoogleLoginButton.tsx index a4ed4a3a0..ee5deb3a9 100644 --- a/surfsense_web/app/login/GoogleLoginButton.tsx +++ b/surfsense_web/app/login/GoogleLoginButton.tsx @@ -3,6 +3,7 @@ import React from "react"; import { IconBrandGoogleFilled } from "@tabler/icons-react"; import { motion } from "framer-motion"; import { Logo } from "@/components/Logo"; +import { AmbientBackground } from "./AmbientBackground"; export function GoogleLoginButton() { const handleGoogleLogin = () => { @@ -34,6 +35,42 @@ export function GoogleLoginButton() { Welcome Back + + + + + + + +
+

+ SurfSense Cloud is currently in development. Check the Docs for more information on the self-hosted version. +

+
+
+
+
); -} - - - -const AmbientBackground = () => { - return ( -
-
-
-
-
- ); -}; \ No newline at end of file +} \ No newline at end of file diff --git a/surfsense_web/app/login/LocalLoginForm.tsx b/surfsense_web/app/login/LocalLoginForm.tsx new file mode 100644 index 000000000..345941802 --- /dev/null +++ b/surfsense_web/app/login/LocalLoginForm.tsx @@ -0,0 +1,114 @@ +"use client"; +import React, { useState, useEffect } from "react"; +import { useRouter } from "next/navigation"; +import Link from "next/link"; + +export function LocalLoginForm() { + const [username, setUsername] = useState(""); + const [password, setPassword] = useState(""); + const [error, setError] = useState(""); + const [isLoading, setIsLoading] = useState(false); + const [authType, setAuthType] = useState(null); + const router = useRouter(); + + useEffect(() => { + // Get the auth type from environment variables + setAuthType(process.env.NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE || "GOOGLE"); + }, []); + + const handleSubmit = async (e: React.FormEvent) => { + e.preventDefault(); + setIsLoading(true); + setError(""); + + try { + // Create form data for the API request + const formData = new URLSearchParams(); + formData.append("username", username); + formData.append("password", password); + formData.append("grant_type", "password"); + + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/auth/jwt/login`, + { + method: "POST", + headers: { + "Content-Type": "application/x-www-form-urlencoded", + }, + body: formData.toString(), + } + ); + + const data = await response.json(); + + if (!response.ok) { + throw new Error(data.detail || "Failed to login"); + } + + router.push("/auth/callback?token=" + data.access_token); + } catch (err: any) { + setError(err.message || "An error occurred during login"); + } finally { + setIsLoading(false); + } + }; + + return ( +
+
+ {error && ( +
+ {error} +
+ )} + +
+ + setUsername(e.target.value)} + className="mt-1 block w-full rounded-md border border-gray-300 bg-white px-3 py-2 shadow-sm focus:border-blue-500 focus:outline-none focus:ring-blue-500 dark:border-gray-700 dark:bg-gray-800 dark:text-white" + /> +
+ +
+ + setPassword(e.target.value)} + className="mt-1 block w-full rounded-md border border-gray-300 bg-white px-3 py-2 shadow-sm focus:border-blue-500 focus:outline-none focus:ring-blue-500 dark:border-gray-700 dark:bg-gray-800 dark:text-white" + /> +
+ + +
+ + {authType === "LOCAL" && ( +
+

+ Don't have an account?{" "} + + Register here + +

+
+ )} +
+ ); +} \ No newline at end of file diff --git a/surfsense_web/app/login/page.tsx b/surfsense_web/app/login/page.tsx index ee3b46200..65fa0b873 100644 --- a/surfsense_web/app/login/page.tsx +++ b/surfsense_web/app/login/page.tsx @@ -1,5 +1,89 @@ +"use client"; + +import { useState, useEffect, Suspense } from "react"; import { GoogleLoginButton } from "./GoogleLoginButton"; +import { LocalLoginForm } from "./LocalLoginForm"; +import { Logo } from "@/components/Logo"; +import { AmbientBackground } from "./AmbientBackground"; +import { useSearchParams } from "next/navigation"; +import { Loader2 } from "lucide-react"; + +function LoginContent() { + const [authType, setAuthType] = useState(null); + const [registrationSuccess, setRegistrationSuccess] = useState(false); + const [isLoading, setIsLoading] = useState(true); + const searchParams = useSearchParams(); + + useEffect(() => { + // Check if the user was redirected from registration + if (searchParams.get("registered") === "true") { + setRegistrationSuccess(true); + } + + // Get the auth type from environment variables + setAuthType(process.env.NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE || "GOOGLE"); + setIsLoading(false); + }, [searchParams]); + + // Show loading state while determining auth type + if (isLoading) { + return ( +
+ +
+ +
+ + Loading... +
+
+
+ ); + } + + if (authType === "GOOGLE") { + return ; + } + + return ( +
+ +
+ +

+ Sign In +

+ + {registrationSuccess && ( +
+ Registration successful! You can now sign in with your credentials. +
+ )} + + +
+
+ ); +} + +// Loading fallback for Suspense +const LoadingFallback = () => ( +
+ +
+ +
+ + Loading... +
+
+
+); export default function LoginPage() { - return ; + return ( + }> + + + ); } \ No newline at end of file diff --git a/surfsense_web/app/register/page.tsx b/surfsense_web/app/register/page.tsx new file mode 100644 index 000000000..33e012608 --- /dev/null +++ b/surfsense_web/app/register/page.tsx @@ -0,0 +1,149 @@ +"use client"; +import React, { useState, useEffect } from "react"; +import { useRouter } from "next/navigation"; +import Link from "next/link"; +import { Logo } from "@/components/Logo"; +import { AmbientBackground } from "../login/AmbientBackground"; + +export default function RegisterPage() { + const [email, setEmail] = useState(""); + const [password, setPassword] = useState(""); + const [confirmPassword, setConfirmPassword] = useState(""); + const [error, setError] = useState(""); + const [isLoading, setIsLoading] = useState(false); + const router = useRouter(); + + // Check authentication type and redirect if not LOCAL + useEffect(() => { + const authType = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE || "GOOGLE"; + if (authType !== "LOCAL") { + router.push("/login"); + } + }, [router]); + + const handleSubmit = async (e: React.FormEvent) => { + e.preventDefault(); + + // Form validation + if (password !== confirmPassword) { + setError("Passwords do not match"); + return; + } + + setIsLoading(true); + setError(""); + + try { + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/auth/register`, + { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + email, + password, + is_active: true, + is_superuser: false, + is_verified: false, + }), + } + ); + + const data = await response.json(); + + if (!response.ok) { + throw new Error(data.detail || "Registration failed"); + } + + // Redirect to login page after successful registration + router.push("/login?registered=true"); + } catch (err: any) { + setError(err.message || "An error occurred during registration"); + } finally { + setIsLoading(false); + } + }; + + return ( +
+ +
+ +

+ Create an Account +

+ +
+
+ {error && ( +
+ {error} +
+ )} + +
+ + setEmail(e.target.value)} + className="mt-1 block w-full rounded-md border border-gray-300 bg-white px-3 py-2 shadow-sm focus:border-blue-500 focus:outline-none focus:ring-blue-500 dark:border-gray-700 dark:bg-gray-800 dark:text-white" + /> +
+ +
+ + setPassword(e.target.value)} + className="mt-1 block w-full rounded-md border border-gray-300 bg-white px-3 py-2 shadow-sm focus:border-blue-500 focus:outline-none focus:ring-blue-500 dark:border-gray-700 dark:bg-gray-800 dark:text-white" + /> +
+ +
+ + setConfirmPassword(e.target.value)} + className="mt-1 block w-full rounded-md border border-gray-300 bg-white px-3 py-2 shadow-sm focus:border-blue-500 focus:outline-none focus:ring-blue-500 dark:border-gray-700 dark:bg-gray-800 dark:text-white" + /> +
+ + +
+ +
+

+ Already have an account?{" "} + + Sign in + +

+
+
+
+
+ ); +} \ No newline at end of file diff --git a/surfsense_web/app/sitemap.ts b/surfsense_web/app/sitemap.ts new file mode 100644 index 000000000..cbd35fba1 --- /dev/null +++ b/surfsense_web/app/sitemap.ts @@ -0,0 +1,48 @@ +import type { MetadataRoute } from 'next' + +export default function sitemap(): MetadataRoute.Sitemap { + return [ + { + url: 'https://www.surfsense.net/', + lastModified: new Date(), + changeFrequency: 'yearly', + priority: 1, + }, + { + url: 'https://www.surfsense.net/privacy', + lastModified: new Date(), + changeFrequency: 'monthly', + priority: 0.9, + }, + { + url: 'https://www.surfsense.net/terms', + lastModified: new Date(), + changeFrequency: 'monthly', + priority: 0.9, + }, + { + url: 'https://www.surfsense.net/docs', + lastModified: new Date(), + changeFrequency: 'weekly', + priority: 0.9, + }, + { + url: 'https://www.surfsense.net/docs/installation', + lastModified: new Date(), + changeFrequency: 'weekly', + priority: 0.9, + }, + { + url: 'https://www.surfsense.net/docs/docker-installation', + lastModified: new Date(), + changeFrequency: 'weekly', + priority: 0.9, + }, + { + url: 'https://www.surfsense.net/docs/manual-installation', + lastModified: new Date(), + changeFrequency: 'weekly', + priority: 0.9, + }, + ] +} diff --git a/surfsense_web/components/ModernHeroWithGradients.tsx b/surfsense_web/components/ModernHeroWithGradients.tsx index 052c993da..1a64fc446 100644 --- a/surfsense_web/components/ModernHeroWithGradients.tsx +++ b/surfsense_web/components/ModernHeroWithGradients.tsx @@ -19,6 +19,17 @@ export function ModernHeroWithGradients() {
+
+ + MODSetter%2FSurfSense | Trendshift + +

- A Customizable AI Research Agent just like NotebookLM or Perplexity, but connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more. + A Customizable AI Research Agent just like NotebookLM or Perplexity, but connected to external sources such as search engines (Tavily, LinkUp), Slack, Linear, Notion, YouTube, GitHub and more.

{ const DesktopNav = ({ navItems, visible }: NavbarProps) => { const [hoveredIndex, setHoveredIndex] = useState(null); - + const handleGoogleLogin = () => { - // Redirect to Google OAuth authorization URL - fetch(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/auth/google/authorize`) - .then((response) => { - if (!response.ok) { - throw new Error('Failed to get authorization URL'); - } - return response.json(); - }) - .then((data) => { - if (data.authorization_url) { - window.location.href = data.authorization_url; - } else { - console.error('No authorization URL received'); - } - }) - .catch((error) => { - console.error('Error during Google login:', error); - }); + // Redirect to the login page + window.location.href = '/login'; }; return ( @@ -89,8 +73,8 @@ const DesktopNav = ({ navItems, visible }: NavbarProps) => { onMouseLeave={() => setHoveredIndex(null)} animate={{ backdropFilter: "blur(16px)", - background: visible - ? "rgba(var(--background-rgb), 0.8)" + background: visible + ? "rgba(var(--background-rgb), 0.8)" : "rgba(var(--background-rgb), 0.6)", width: visible ? "38%" : "80%", height: visible ? "48px" : "64px", @@ -115,7 +99,7 @@ const DesktopNav = ({ navItems, visible }: NavbarProps) => { } as React.CSSProperties} >
- + SurfSense
@@ -191,8 +175,8 @@ const DesktopNav = ({ navItems, visible }: NavbarProps) => { variant="outline" className="hidden cursor-pointer md:flex items-center gap-2 rounded-full dark:bg-white/20 dark:hover:bg-white/30 dark:text-white bg-gray-100 hover:bg-gray-200 text-gray-800 border-0" > - - Sign in with Google + + Sign in )} @@ -204,19 +188,19 @@ const DesktopNav = ({ navItems, visible }: NavbarProps) => { const MobileNav = ({ navItems, visible }: NavbarProps) => { const [open, setOpen] = useState(false); - + const handleGoogleLogin = () => { // Redirect to the login page window.location.href = "./login"; }; - + return ( <> { } as React.CSSProperties} >
- +
{open ? ( @@ -294,8 +278,8 @@ const MobileNav = ({ navItems, visible }: NavbarProps) => { variant="outline" className="flex cursor-pointer items-center gap-2 mt-4 w-full justify-center rounded-full dark:bg-white/20 dark:hover:bg-white/30 dark:text-white bg-gray-100 hover:bg-gray-200 text-gray-800 border-0" > - - Sign in with Google + + Sign in )} diff --git a/surfsense_web/components/chat/ConnectorComponents.tsx b/surfsense_web/components/chat/ConnectorComponents.tsx index 2b4ab8455..163d5bf20 100644 --- a/surfsense_web/components/chat/ConnectorComponents.tsx +++ b/surfsense_web/components/chat/ConnectorComponents.tsx @@ -11,7 +11,7 @@ import { Link, Webhook, } from 'lucide-react'; -import { IconBrandNotion, IconBrandSlack, IconBrandYoutube, IconBrandGithub, IconLayoutKanban } from "@tabler/icons-react"; +import { IconBrandNotion, IconBrandSlack, IconBrandYoutube, IconBrandGithub, IconLayoutKanban, IconLinkPlus } from "@tabler/icons-react"; import { Button } from '@/components/ui/button'; import { Connector, ResearchMode } from './types'; @@ -20,6 +20,8 @@ export const getConnectorIcon = (connectorType: string) => { const iconProps = { className: "h-4 w-4" }; switch(connectorType) { + case 'LINKUP_API': + return ; case 'LINEAR_CONNECTOR': return ; case 'GITHUB_CONNECTOR': @@ -145,7 +147,7 @@ export const ConnectorButton = ({ selectedConnectors, onClick, connectorSources return ( +
+ {mounted ? ( + + {children} + + ) : ( +
+
+            {children}
+          
+
+ )} +
+ ); +}; + // Helper function to process citations within React children const processCitationsInReactChildren = (children: React.ReactNode, getCitationSource: (id: number) => Source | null): React.ReactNode => { // If children is not an array or string, just return it diff --git a/surfsense_web/components/sidebar/app-sidebar.tsx b/surfsense_web/components/sidebar/app-sidebar.tsx index ac5f978ab..4f56c4252 100644 --- a/surfsense_web/components/sidebar/app-sidebar.tsx +++ b/surfsense_web/components/sidebar/app-sidebar.tsx @@ -14,6 +14,7 @@ import { Info, ExternalLink, Trash2, + Podcast, type LucideIcon, } from "lucide-react" @@ -45,7 +46,8 @@ export const iconMap: Record = { AlertCircle, Info, ExternalLink, - Trash2 + Trash2, + Podcast } const defaultData = { diff --git a/surfsense_web/components/ui/slider.tsx b/surfsense_web/components/ui/slider.tsx new file mode 100644 index 000000000..f6ab6d565 --- /dev/null +++ b/surfsense_web/components/ui/slider.tsx @@ -0,0 +1,28 @@ +"use client" + +import * as React from "react" +import * as SliderPrimitive from "@radix-ui/react-slider" + +import { cn } from "@/lib/utils" + +const Slider = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + + + + + + +)) +Slider.displayName = SliderPrimitive.Root.displayName + +export { Slider } \ No newline at end of file diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx index 2a373d048..03d687489 100644 --- a/surfsense_web/content/docs/docker-installation.mdx +++ b/surfsense_web/content/docs/docker-installation.mdx @@ -1,8 +1,9 @@ --- title: Docker Installation -description: Setting up SurfSense using Docker +description: Setting up SurfSense using Docker full: true --- + ## Known Limitations âš ī¸ **Important Note:** Currently, the following features have limited functionality when running in Docker: @@ -12,8 +13,7 @@ full: true We're actively working to resolve these limitations in future releases. - -# Docker Installation +# Docker Installation This guide explains how to run SurfSense using Docker Compose, which is the preferred and recommended method for deployment. @@ -32,125 +32,203 @@ Before you begin, ensure you have: ## Installation Steps 1. 
**Configure Environment Variables** - - Set up the necessary environment variables: - - **Linux/macOS:** - ```bash - # Copy example environment files - cp surfsense_backend/.env.example surfsense_backend/.env - cp surfsense_web/.env.example surfsense_web/.env - ``` - - **Windows (Command Prompt):** - ```cmd - copy surfsense_backend\.env.example surfsense_backend\.env - copy surfsense_web\.env.example surfsense_web\.env - ``` - - **Windows (PowerShell):** - ```powershell - Copy-Item -Path surfsense_backend\.env.example -Destination surfsense_backend\.env - Copy-Item -Path surfsense_web\.env.example -Destination surfsense_web\.env - ``` + Set up the necessary environment variables: - Edit both `.env` files and fill in the required values: + **Linux/macOS:** - **Backend Environment Variables:** + ```bash + # Copy example environment files + cp surfsense_backend/.env.example surfsense_backend/.env + cp surfsense_web/.env.example surfsense_web/.env + cp .env.example .env # For Docker-specific settings + ``` - | ENV VARIABLE | DESCRIPTION | - |--------------|-------------| - | DATABASE_URL | PostgreSQL connection string (e.g., `postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense`) | - | SECRET_KEY | JWT Secret key for authentication (should be a secure random string) | - | GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID obtained from Google Cloud Console | - | GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret obtained from Google Cloud Console | - | NEXT_FRONTEND_URL | URL where your frontend application is hosted (e.g., `http://localhost:3000`) | - | EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | - | RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | - | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | - | FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | - | STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) | - | LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) | - | UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing | - | FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | + **Windows (Command Prompt):** - Include API keys for the LLM providers you're using. For example: - - `OPENAI_API_KEY`: If using OpenAI models - - `GEMINI_API_KEY`: If using Google Gemini models - - For other LLM providers, refer to the [LiteLLM documentation](https://docs.litellm.ai/docs/providers). 
+ ```cmd + copy surfsense_backend\.env.example surfsense_backend\.env + copy surfsense_web\.env.example surfsense_web\.env + copy .env.example .env + ``` - **Frontend Environment Variables:** + **Windows (PowerShell):** - | ENV VARIABLE | DESCRIPTION | - |--------------|-------------| - | NEXT_PUBLIC_FASTAPI_BACKEND_URL | URL of the backend service (e.g., `http://localhost:8000`) | + ```powershell + Copy-Item -Path surfsense_backend\.env.example -Destination surfsense_backend\.env + Copy-Item -Path surfsense_web\.env.example -Destination surfsense_web\.env + Copy-Item -Path .env.example -Destination .env + ``` + + Edit all `.env` files and fill in the required values: + +### Docker-Specific Environment Variables + +| ENV VARIABLE | DESCRIPTION | DEFAULT VALUE | +|----------------------------|-----------------------------------------------------------------------------|---------------------| +| FRONTEND_PORT | Port for the frontend service | 3000 | +| BACKEND_PORT | Port for the backend API service | 8000 | +| POSTGRES_PORT | Port for the PostgreSQL database | 5432 | +| PGADMIN_PORT | Port for pgAdmin web interface | 5050 | +| POSTGRES_USER | PostgreSQL username | postgres | +| POSTGRES_PASSWORD | PostgreSQL password | postgres | +| POSTGRES_DB | PostgreSQL database name | surfsense | +| PGADMIN_DEFAULT_EMAIL | Email for pgAdmin login | admin@surfsense.com | +| PGADMIN_DEFAULT_PASSWORD | Password for pgAdmin login | surfsense | +| NEXT_PUBLIC_API_URL | URL of the backend API (used by frontend) | http://backend:8000 | + +**Backend Environment Variables:** + +| ENV VARIABLE | DESCRIPTION | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DATABASE_URL | PostgreSQL connection string (e.g., `postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense`) | +| SECRET_KEY | JWT Secret key for authentication (should be a secure random string) | +| AUTH_TYPE | Authentication method: `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication | +| NEXT_FRONTEND_URL | URL where your frontend application is hosted (e.g., `http://localhost:3000`) | +| EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | +| RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | +| RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | +| FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | +| STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) | +| LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) | +| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) | +| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) | +| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) | +| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | +| TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). 
See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) |
+| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) |
+
+Include API keys for your chosen LLM providers:
+
+| ENV VARIABLE | DESCRIPTION |
+|--------------------|-----------------------------------------------------------------------------|
+| `OPENAI_API_KEY` | Required if using OpenAI models |
+| `GEMINI_API_KEY` | Required if using Google Gemini models |
+| `ANTHROPIC_API_KEY`| Required if using Anthropic models |
+
+### Google OAuth Configuration (if AUTH_TYPE=GOOGLE)
+
+| ENV VARIABLE | DESCRIPTION |
+|----------------------------|-----------------------------------------------------------------------------|
+| `GOOGLE_OAUTH_CLIENT_ID` | Client ID from Google Cloud Console |
+| `GOOGLE_OAUTH_CLIENT_SECRET` | Client secret from Google Cloud Console |
+
+**Optional Backend LangSmith Observability:**
+| ENV VARIABLE | DESCRIPTION |
+|--------------|-------------|
+| LANGSMITH_TRACING | Enable LangSmith tracing (e.g., `true`) |
+| LANGSMITH_ENDPOINT | LangSmith API endpoint (e.g., `https://api.smith.langchain.com`) |
+| LANGSMITH_API_KEY | Your LangSmith API key |
+| LANGSMITH_PROJECT | LangSmith project name (e.g., `surfsense`) |
+
+**Optional Backend LiteLLM API Base URLs:**
+| ENV VARIABLE | DESCRIPTION |
+|--------------|-------------|
+| FAST_LLM_API_BASE | Custom API base URL for the fast LLM |
+| STRATEGIC_LLM_API_BASE | Custom API base URL for the strategic LLM |
+| LONG_CONTEXT_LLM_API_BASE | Custom API base URL for the long context LLM |
+| TTS_SERVICE_API_BASE | Custom API base URL for the Text-to-Speech (TTS) service |
+| STT_SERVICE_API_BASE | Custom API base URL for the Speech-to-Text (STT) service |
+
+For other LLM providers, refer to the [LiteLLM documentation](https://docs.litellm.ai/docs/providers).
+
+### Frontend Environment Variables
+
+| ENV VARIABLE | DESCRIPTION |
+| ------------------------------- | ---------------------------------------------------------- |
+| NEXT_PUBLIC_FASTAPI_BACKEND_URL | URL of the backend service (e.g., `http://localhost:8000`) |
+| NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE, i.e. `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication |
+| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface |

2. **Build and Start Containers**

   Start the Docker containers:

   **Linux/macOS/Windows:**

   ```bash
-   docker-compose up --build
+   docker compose up --build
   ```

   To run in detached mode (in the background):

   **Linux/macOS/Windows:**

   ```bash
-   docker-compose up -d
+   docker compose up -d
   ```

-   **Note for Windows users:** If you're using older Docker Desktop versions, you might need to use `docker compose` (with a space) instead of `docker-compose`.
+   **Note for Windows users:** If you're using an older Docker Desktop version, you might need to fall back to the legacy `docker-compose` (with a hyphen) instead of `docker compose` (with a space).

3.
**Access the Applications** - + Once the containers are running, you can access: + - Frontend: [http://localhost:3000](http://localhost:3000) - Backend API: [http://localhost:8000](http://localhost:8000) - API Documentation: [http://localhost:8000/docs](http://localhost:8000/docs) + - pgAdmin: [http://localhost:5050](http://localhost:5050) + +## Using pgAdmin + +pgAdmin is included in the Docker setup to help manage your PostgreSQL database. To connect: + +1. Open pgAdmin at [http://localhost:5050](http://localhost:5050) +2. Login with the credentials from your `.env` file (default: admin@surfsense.com / surfsense) +3. Right-click "Servers" > "Create" > "Server" +4. In the "General" tab, name your connection (e.g., "SurfSense DB") +5. In the "Connection" tab: + - Host: `db` + - Port: `5432` + - Maintenance database: `surfsense` + - Username: `postgres` (or your custom POSTGRES_USER) + - Password: `postgres` (or your custom POSTGRES_PASSWORD) +6. Click "Save" to connect ## Useful Docker Commands ### Container Management - **Stop containers:** - + **Linux/macOS/Windows:** + ```bash - docker-compose down + docker compose down ``` - **View logs:** - + **Linux/macOS/Windows:** + ```bash # All services - docker-compose logs -f - + docker compose logs -f + # Specific service - docker-compose logs -f backend - docker-compose logs -f frontend - docker-compose logs -f db + docker compose logs -f backend + docker compose logs -f frontend + docker compose logs -f db ``` - **Restart a specific service:** - + **Linux/macOS/Windows:** + ```bash - docker-compose restart backend + docker compose restart backend ``` - **Execute commands in a running container:** - + **Linux/macOS/Windows:** + ```bash # Backend - docker-compose exec backend python -m pytest - + docker compose exec backend python -m pytest + # Frontend - docker-compose exec frontend pnpm lint + docker compose exec frontend pnpm lint ``` ## Troubleshooting @@ -162,7 +240,6 @@ Before you begin, ensure you have: - For frontend dependency issues, check the `Dockerfile` in the frontend directory. - **Windows-specific:** If you encounter line ending issues (CRLF vs LF), configure Git to handle line endings properly with `git config --global core.autocrlf true` before cloning the repository. - ## Next Steps -Once your installation is complete, you can start using SurfSense! Navigate to the frontend URL and log in using your Google account. \ No newline at end of file +Once your installation is complete, you can start using SurfSense! Navigate to the frontend URL and log in using your Google account. diff --git a/surfsense_web/content/docs/index.mdx b/surfsense_web/content/docs/index.mdx index f3411b897..4845a7312 100644 --- a/surfsense_web/content/docs/index.mdx +++ b/surfsense_web/content/docs/index.mdx @@ -47,9 +47,11 @@ See the [installation notes](https://github.com/pgvector/pgvector/tree/master#in --- -## Google OAuth Setup +## Google OAuth Setup (Optional) -SurfSense user management and authentication works on Google OAuth. Lets set it up. +SurfSense supports both Google OAuth and local email/password authentication. Google OAuth is optional - if you prefer local authentication, you can skip this section. + +To set up Google OAuth: 1. Login to your [Google Developer Console](https://console.cloud.google.com/) 2. Enable People API. 
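To make the auth configuration above concrete, here is a minimal sketch of the relevant `.env` values for the two modes described in the tables. The variable names come from the documentation above; the actual values shown (client ID, secret, URL) are illustrative placeholders only:

```bash
# Illustrative values only - adjust for your own deployment

# Option A: Google OAuth
AUTH_TYPE=GOOGLE
GOOGLE_OAUTH_CLIENT_ID=your-client-id.apps.googleusercontent.com   # placeholder
GOOGLE_OAUTH_CLIENT_SECRET=your-client-secret                      # placeholder

# Option B: local email/password authentication (no Google credentials needed)
AUTH_TYPE=LOCAL

# The frontend must mirror the backend setting
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL
NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000
```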
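As a quick sanity check after `docker compose up -d`, the following sketch verifies the stack using only the services and defaults mentioned above (default ports, the `db` service, and the `surfsense` database); it assumes the default credentials from the example environment:

```bash
# All services should report a running state
docker compose ps

# The backend's interactive API docs should respond on the default port
curl -f http://localhost:8000/docs

# The vector (pgvector) extension should be listed in the database
docker compose exec db psql -U postgres -d surfsense -c "SELECT extname FROM pg_extension;"

# Tail backend logs if anything fails to come up
docker compose logs -f backend
```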
diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx index 477f5ef17..82b9f0fef 100644 --- a/surfsense_web/content/docs/manual-installation.mdx +++ b/surfsense_web/content/docs/manual-installation.mdx @@ -27,18 +27,21 @@ The backend is the core of SurfSense. Follow these steps to set it up: First, create and configure your environment variables by copying the example file: **Linux/macOS:** + ```bash cd surfsense_backend cp .env.example .env ``` **Windows (Command Prompt):** + ```cmd cd surfsense_backend copy .env.example .env ``` **Windows (PowerShell):** + ```powershell cd surfsense_backend Copy-Item -Path .env.example -Destination .env @@ -46,32 +49,67 @@ Copy-Item -Path .env.example -Destination .env Edit the `.env` file and set the following variables: +| ENV VARIABLE | DESCRIPTION | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DATABASE_URL | PostgreSQL connection string (e.g., `postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense`) | +| SECRET_KEY | JWT Secret key for authentication (should be a secure random string) | +| AUTH_TYPE | Authentication method: `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication | +| NEXT_FRONTEND_URL | URL where your frontend application is hosted (e.g., `http://localhost:3000`) | +| EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | +| RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | +| RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | +| FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | +| STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) | +| LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) | +| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) | +| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) | +| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) | +| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | +| TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) | +| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). 
See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | + + +Include API keys for your chosen LLM providers: + +| ENV VARIABLE | DESCRIPTION | +|--------------------|-----------------------------------------------------------------------------| +| `OPENAI_API_KEY` | Required if using OpenAI models | +| `GEMINI_API_KEY` | Required if using Google Gemini models | +| `ANTHROPIC_API_KEY`| Required if using Anthropic models | + +For other providers, refer to the [LiteLLM documentation](https://docs.litellm.ai/docs/providers) + +### Google OAuth Configuration (if AUTH_TYPE=GOOGLE) + +| ENV VARIABLE | DESCRIPTION | +|----------------------------|-----------------------------------------------------------------------------| +| `GOOGLE_OAUTH_CLIENT_ID` | Client ID from Google Cloud Console | +| `GOOGLE_OAUTH_CLIENT_SECRET` | Client secret from Google Cloud Console | + + +**Optional Backend LangSmith Observability:** | ENV VARIABLE | DESCRIPTION | |--------------|-------------| -| DATABASE_URL | PostgreSQL connection string (e.g., `postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense`) | -| SECRET_KEY | JWT Secret key for authentication (should be a secure random string) | -| GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID | -| GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret | -| NEXT_FRONTEND_URL | Frontend application URL (e.g., `http://localhost:3000`) | -| EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | -| RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | -| RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | -| FAST_LLM | LiteLLM routed faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | -| STRATEGIC_LLM | LiteLLM routed advanced LLM (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) | -| LONG_CONTEXT_LLM | LiteLLM routed long-context LLM (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) | -| UNSTRUCTURED_API_KEY | API key for Unstructured.io service | -| FIRECRAWL_API_KEY | API key for Firecrawl service (if using crawler) | +| LANGSMITH_TRACING | Enable LangSmith tracing (e.g., `true`) | +| LANGSMITH_ENDPOINT | LangSmith API endpoint (e.g., `https://api.smith.langchain.com`) | +| LANGSMITH_API_KEY | Your LangSmith API key | +| LANGSMITH_PROJECT | LangSmith project name (e.g., `surfsense`) | -**Important**: Since LLM calls are routed through LiteLLM, include API keys for the LLM providers you're using: -- For OpenAI models: `OPENAI_API_KEY` -- For Google Gemini models: `GEMINI_API_KEY` -- For other providers, refer to the [LiteLLM documentation](https://docs.litellm.ai/docs/providers) +**Optional Backend LiteLLM API Base URLs:** +| ENV VARIABLE | DESCRIPTION | +|--------------|-------------| +| FAST_LLM_API_BASE | Custom API base URL for the fast LLM | +| STRATEGIC_LLM_API_BASE | Custom API base URL for the strategic LLM | +| LONG_CONTEXT_LLM_API_BASE | Custom API base URL for the long context LLM | +| TTS_SERVICE_API_BASE | Custom API base URL for the Text-to-Speech (TTS) service | +| STT_SERVICE_API_BASE | Custom API base URL for the Speech-to-Text (STT) service | ### 2. 
Install Dependencies Install the backend dependencies using `uv`: **Linux/macOS:** + ```bash # Install uv if you don't have it curl -fsSL https://astral.sh/uv/install.sh | bash @@ -81,6 +119,7 @@ uv sync ``` **Windows (PowerShell):** + ```powershell # Install uv if you don't have it iwr -useb https://astral.sh/uv/install.ps1 | iex @@ -90,6 +129,7 @@ uv sync ``` **Windows (Command Prompt):** + ```cmd # Install dependencies with uv (after installing uv) uv sync @@ -100,6 +140,7 @@ uv sync Start the backend server: **Linux/macOS/Windows:** + ```bash # Run without hot reloading uv run main.py @@ -117,18 +158,21 @@ If everything is set up correctly, you should see output indicating the server i Set up the frontend environment: **Linux/macOS:** + ```bash cd surfsense_web cp .env.example .env ``` **Windows (Command Prompt):** + ```cmd cd surfsense_web copy .env.example .env ``` **Windows (PowerShell):** + ```powershell cd surfsense_web Copy-Item -Path .env.example -Destination .env @@ -136,15 +180,18 @@ Copy-Item -Path .env.example -Destination .env Edit the `.env` file and set: -| ENV VARIABLE | DESCRIPTION | -|--------------|-------------| +| ENV VARIABLE | DESCRIPTION | +| ------------------------------- | ------------------------------------------- | | NEXT_PUBLIC_FASTAPI_BACKEND_URL | Backend URL (e.g., `http://localhost:8000`) | +| NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE i.e `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication | +| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface | ### 2. Install Dependencies Install the frontend dependencies: **Linux/macOS:** + ```bash # Install pnpm if you don't have it npm install -g pnpm @@ -154,6 +201,7 @@ pnpm install ``` **Windows:** + ```powershell # Install pnpm if you don't have it npm install -g pnpm @@ -167,6 +215,7 @@ pnpm install Start the Next.js development server: **Linux/macOS/Windows:** + ```bash pnpm run dev ``` @@ -180,18 +229,21 @@ The SurfSense browser extension allows you to save any webpage, including those ### 1. Environment Configuration **Linux/macOS:** + ```bash cd surfsense_browser_extension cp .env.example .env ``` **Windows (Command Prompt):** + ```cmd cd surfsense_browser_extension copy .env.example .env ``` **Windows (PowerShell):** + ```powershell cd surfsense_browser_extension Copy-Item -Path .env.example -Destination .env @@ -199,8 +251,8 @@ Copy-Item -Path .env.example -Destination .env Edit the `.env` file: -| ENV VARIABLE | DESCRIPTION | -|--------------|-------------| +| ENV VARIABLE | DESCRIPTION | +| ------------------------- | ----------------------------------------------------- | | PLASMO_PUBLIC_BACKEND_URL | SurfSense Backend URL (e.g., `http://127.0.0.1:8000`) | ### 2. Build the Extension @@ -208,6 +260,7 @@ Edit the `.env` file: Build the extension for your browser using the [Plasmo framework](https://docs.plasmo.com/framework/workflows/build#with-a-specific-target). 
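As a rough sketch, assuming Plasmo's default package scripts (the script name, target flag, and output path shown here are assumptions; check the extension's `package.json` and the Plasmo documentation linked above), the build step usually looks like this once dependencies are installed:

```bash
# Production build for the default target (Chrome, Manifest V3);
# output is typically written to build/chrome-mv3-prod
pnpm build

# Or build for a specific browser target, e.g. Firefox
pnpm build --target=firefox-mv2
```

The dependency installation itself is shown below.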
**Linux/macOS/Windows:** + ```bash # Install dependencies pnpm install @@ -252,7 +305,8 @@ Now that you have SurfSense running locally, you can explore its features: - Explore the advanced RAG capabilities For production deployments, consider setting up: + - A reverse proxy like Nginx - SSL certificates for secure connections - Proper database backups -- User access controls \ No newline at end of file +- User access controls diff --git a/surfsense_web/hooks/useConnectorEditPage.ts b/surfsense_web/hooks/useConnectorEditPage.ts index d7672025d..7e81c5524 100644 --- a/surfsense_web/hooks/useConnectorEditPage.ts +++ b/surfsense_web/hooks/useConnectorEditPage.ts @@ -59,7 +59,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string) NOTION_INTEGRATION_TOKEN: config.NOTION_INTEGRATION_TOKEN || "", SERPER_API_KEY: config.SERPER_API_KEY || "", TAVILY_API_KEY: config.TAVILY_API_KEY || "", - LINEAR_API_KEY: config.LINEAR_API_KEY || "" + LINEAR_API_KEY: config.LINEAR_API_KEY || "", + LINKUP_API_KEY: config.LINKUP_API_KEY || "" }); if (currentConnector.connector_type === 'GITHUB_CONNECTOR') { const savedRepos = config.repo_full_names || []; @@ -164,6 +165,12 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string) newConfig = { LINEAR_API_KEY: formData.LINEAR_API_KEY }; } break; + case 'LINKUP_API': + if (formData.LINKUP_API_KEY !== originalConfig.LINKUP_API_KEY) { + if (!formData.LINKUP_API_KEY) { toast.error("Linkup API Key cannot be empty."); setIsSaving(false); return; } + newConfig = { LINKUP_API_KEY: formData.LINKUP_API_KEY }; + } + break; } if (newConfig !== null) { @@ -203,6 +210,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string) editForm.setValue('TAVILY_API_KEY', newlySavedConfig.TAVILY_API_KEY || ""); } else if(connector.connector_type === 'LINEAR_CONNECTOR') { editForm.setValue('LINEAR_API_KEY', newlySavedConfig.LINEAR_API_KEY || ""); + } else if(connector.connector_type === 'LINKUP_API') { + editForm.setValue('LINKUP_API_KEY', newlySavedConfig.LINKUP_API_KEY || ""); } } if (connector.connector_type === 'GITHUB_CONNECTOR') { diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts index 5efc59386..f93bd3f82 100644 --- a/surfsense_web/lib/connectors/utils.ts +++ b/surfsense_web/lib/connectors/utils.ts @@ -7,6 +7,7 @@ export const getConnectorTypeDisplay = (type: string): string => { "NOTION_CONNECTOR": "Notion", "GITHUB_CONNECTOR": "GitHub", "LINEAR_CONNECTOR": "Linear", + "LINKUP_API": "Linkup", }; return typeMap[type] || type; }; diff --git a/surfsense_web/package.json b/surfsense_web/package.json index 07fcb5549..d9f5529b6 100644 --- a/surfsense_web/package.json +++ b/surfsense_web/package.json @@ -1,6 +1,6 @@ { "name": "surfsense_web", - "version": "0.0.6", + "version": "0.0.7", "private": true, "description": "SurfSense Frontend", "scripts": { @@ -28,12 +28,14 @@ "@radix-ui/react-popover": "^1.1.6", "@radix-ui/react-select": "^2.1.6", "@radix-ui/react-separator": "^1.1.2", + "@radix-ui/react-slider": "^1.3.4", "@radix-ui/react-slot": "^1.1.2", "@radix-ui/react-tabs": "^1.1.3", "@radix-ui/react-tooltip": "^1.1.8", "@tabler/icons-react": "^3.30.0", "@tanstack/react-table": "^8.21.2", "@types/mdx": "^2.0.13", + "@types/react-syntax-highlighter": "^15.5.13", "ai": "^4.1.54", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", @@ -54,6 +56,7 @@ "react-json-view": "^1.21.3", "react-json-view-lite": "^2.4.0", "react-markdown": "^10.0.1", + "react-syntax-highlighter": 
"^15.6.1", "rehype-raw": "^7.0.0", "rehype-sanitize": "^6.0.0", "remark-gfm": "^4.0.1", diff --git a/surfsense_web/pnpm-lock.yaml b/surfsense_web/pnpm-lock.yaml index a8eb3c82a..e7ab3e9f9 100644 --- a/surfsense_web/pnpm-lock.yaml +++ b/surfsense_web/pnpm-lock.yaml @@ -47,6 +47,9 @@ importers: '@radix-ui/react-separator': specifier: ^1.1.2 version: 1.1.2(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0) + '@radix-ui/react-slider': + specifier: ^1.3.4 + version: 1.3.4(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0) '@radix-ui/react-slot': specifier: ^1.1.2 version: 1.1.2(@types/react@19.0.10)(react@19.0.0) @@ -65,6 +68,9 @@ importers: '@types/mdx': specifier: ^2.0.13 version: 2.0.13 + '@types/react-syntax-highlighter': + specifier: ^15.5.13 + version: 15.5.13 ai: specifier: ^4.1.54 version: 4.1.54(react@19.0.0)(zod@3.24.2) @@ -125,6 +131,9 @@ importers: react-markdown: specifier: ^10.0.1 version: 10.0.1(@types/react@19.0.10)(react@19.0.0) + react-syntax-highlighter: + specifier: ^15.6.1 + version: 15.6.1(react@19.0.0) rehype-raw: specifier: ^7.0.0 version: 7.0.0 @@ -960,6 +969,19 @@ packages: '@types/react-dom': optional: true + '@radix-ui/react-collection@1.1.6': + resolution: {integrity: sha512-PbhRFK4lIEw9ADonj48tiYWzkllz81TM7KVYyyMMw2cwHO7D5h4XKEblL8NlaRisTK3QTe6tBEhDccFUryxHBQ==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-compose-refs@1.0.0': resolution: {integrity: sha512-0KaSv6sx787/hK3eF53iOkiSLwAGlFMx5lotrqD2pTjB18KbybKoEIgkNZTKC60YECDQTKGTRcDBILwZVqVKvA==} peerDependencies: @@ -1480,6 +1502,19 @@ packages: '@types/react-dom': optional: true + '@radix-ui/react-primitive@2.1.2': + resolution: {integrity: sha512-uHa+l/lKfxuDD2zjN/0peM/RhhSmRjr5YWdk/37EnSv1nJ88uvG85DPexSm8HdFQROd2VdERJ6ynXbkCFi+APw==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-roving-focus@1.1.2': resolution: {integrity: sha512-zgMQWkNO169GtGqRvYrzb0Zf8NhMHS2DuEB/TiEmVnpr5OqPU3i8lfbxaAmC2J/KYuIQxyoQQ6DxepyXp61/xw==} peerDependencies: @@ -1545,6 +1580,19 @@ packages: '@types/react-dom': optional: true + '@radix-ui/react-slider@1.3.4': + resolution: {integrity: sha512-Cp6hEmQtRJFci285vkdIJ+HCDLTRDk+25VhFwa1fcubywjMUE3PynBgtN5RLudOgSCYMlT4jizCXdmV+8J7Y2w==} + peerDependencies: + '@types/react': '*' + '@types/react-dom': '*' + react: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + react-dom: ^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@types/react-dom': + optional: true + '@radix-ui/react-slot@1.0.0': resolution: {integrity: sha512-3mrKauI/tWXo1Ll+gN5dHcxDPdm/Df1ufcDLCecn+pnCIVcdWE7CujXo8QaXOWRJyZyQWWbpB8eFwHzWXlv5mQ==} peerDependencies: @@ -1577,6 +1625,15 @@ packages: '@types/react': optional: true + '@radix-ui/react-slot@1.2.2': + resolution: {integrity: sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==} + peerDependencies: + '@types/react': '*' + react: ^16.8 || ^17.0 || 
^18.0 || ^19.0 || ^19.0.0-rc + peerDependenciesMeta: + '@types/react': + optional: true + '@radix-ui/react-tabs@1.1.3': resolution: {integrity: sha512-9mFyI30cuRDImbmFF6O2KUJdgEOsGh9Vmx9x/Dh9tOhL7BngmQPQfwW4aejKm5OHpfWIdmeV6ySyuxoOGjtNng==} peerDependencies: @@ -1996,6 +2053,9 @@ packages: '@types/estree@1.0.6': resolution: {integrity: sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==} + '@types/hast@2.3.10': + resolution: {integrity: sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==} + '@types/hast@3.0.4': resolution: {integrity: sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==} @@ -2022,6 +2082,9 @@ packages: peerDependencies: '@types/react': ^19.0.0 + '@types/react-syntax-highlighter@15.5.13': + resolution: {integrity: sha512-uLGJ87j6Sz8UaBAooU0T6lWJ0dBmjZgN1PZTrj05TNql2/XpC6+4HhMT5syIdFUUt+FASfCeLLv4kBygNU+8qA==} + '@types/react@19.0.10': resolution: {integrity: sha512-JuRQ9KXLEjaUNjTWpzuR231Z2WpIwczOkBEIvbHNCzQefFIT0L8IqE6NV6ULLyC1SI/i234JnDoMkfg+RjQj2g==} @@ -2283,12 +2346,21 @@ packages: character-entities-html4@2.1.0: resolution: {integrity: sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA==} + character-entities-legacy@1.1.4: + resolution: {integrity: sha512-3Xnr+7ZFS1uxeiUDvV02wQ+QDbc55o97tIV5zHScSPJpcLm/r0DFPcoY3tYRp+VZukxuMeKgXYmsXQHO05zQeA==} + character-entities-legacy@3.0.0: resolution: {integrity: sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ==} + character-entities@1.2.4: + resolution: {integrity: sha512-iBMyeEHxfVnIakwOuDXpVkc54HijNgCyQB2w0VfGQThle6NXn50zU6V/u+LDhxHcDUPojn6Kpga3PTAD8W1bQw==} + character-entities@2.0.2: resolution: {integrity: sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ==} + character-reference-invalid@1.1.4: + resolution: {integrity: sha512-mKKUkUbhPpQlCOfIuZkvSEgktjPFIsZKRRbC6KWVEMvlzblj3i3asQv5ODsrwt0N3pHAEvjP8KTQPHkp0+6jOg==} + character-reference-invalid@2.0.1: resolution: {integrity: sha512-iBZ4F4wRbyORVsu0jPV7gXkOsGYjGHPmAyv+HiHG8gi5PtC9KI2j1+v8/tlibRvjoWX027ypmG/n0HtO5t7unw==} @@ -2333,6 +2405,9 @@ packages: resolution: {integrity: sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==} engines: {node: '>=12.5.0'} + comma-separated-tokens@1.0.8: + resolution: {integrity: sha512-GHuDRO12Sypu2cV70d1dkA2EUmXHgntrzbpvOB+Qy+49ypNfGgFQIC2fhhXbnyrJRynDCAARsT7Ou0M6hirpfw==} + comma-separated-tokens@2.0.3: resolution: {integrity: sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==} @@ -2711,6 +2786,9 @@ packages: fastq@1.19.1: resolution: {integrity: sha512-GwLTyxkCXjXbxqIhTsMI2Nui8huMPtnxg7krajPJAjnEG/iiOS7i+zCtWGZR9G0NBKbXKh6X9m9UIsYX/N6vvQ==} + fault@1.0.4: + resolution: {integrity: sha512-CJ0HCB5tL5fYTEA7ToAq5+kTwd++Borf1/bifxd9iT70QcXr4MRrO3Llf8Ifs70q+SJcGHFtnIE/Nw6giCtECA==} + fbemitter@3.0.0: resolution: {integrity: sha512-KWKaceCwKQU0+HPoop6gn4eOHk50bBv/VxjJtGMfwmJt3D29JpN4H4eisCtIPA+a8GVBam+ldMMpMjJUvpDyHw==} @@ -2764,6 +2842,10 @@ packages: resolution: {integrity: sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw==} engines: {node: '>=14'} + format@0.2.2: + resolution: {integrity: sha512-wzsgA6WOq+09wrU1tsJ09udeR/YZRaeArL9e1wPbFg3GG2yDnC2ldKpxs4xunpFF9DgqCqOIra3bc1HWrJ37Ww==} + engines: {node: '>=0.4.x'} + framer-motion@12.4.7: resolution: 
{integrity: sha512-VhrcbtcAMXfxlrjeHPpWVu2+mkcoR31e02aNSR7OUS/hZAciKa8q6o3YN2mA1h+jjscRsSyKvX6E1CiY/7OLMw==} peerDependencies: @@ -2934,6 +3016,9 @@ packages: hast-util-from-parse5@8.0.3: resolution: {integrity: sha512-3kxEVkEKt0zvcZ3hCRYI8rqrgwtlIOFMWkbclACvjlDw8Li9S2hk/d51OI0nr/gIpdMHNepwgOKqZ/sy0Clpyg==} + hast-util-parse-selector@2.2.5: + resolution: {integrity: sha512-7j6mrk/qqkSehsM92wQjdIgWM2/BW61u/53G6xmC8i1OmEdKLHbk419QKQUjz6LglWsfqoiHmyMRkP1BGjecNQ==} + hast-util-parse-selector@4.0.0: resolution: {integrity: sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==} @@ -2961,9 +3046,18 @@ packages: hast-util-whitespace@3.0.0: resolution: {integrity: sha512-88JUN06ipLwsnv+dVn+OIYOvAuvBMy/Qoi6O7mQHxdPXpjy+Cd6xRkWwux7DKO+4sYILtLBRIKgsdpS2gQc7qw==} + hastscript@6.0.0: + resolution: {integrity: sha512-nDM6bvd7lIqDUiYEiu5Sl/+6ReP0BMk/2f4U/Rooccxkj0P5nm+acM5PrGJ/t5I8qPGiqZSE6hVAwZEdZIvP4w==} + hastscript@9.0.1: resolution: {integrity: sha512-g7df9rMFX/SPi34tyGCyUBREQoKkapwdY/T04Qn9TDWfHhAYt4/I0gMVirzK5wEzeUqIjEB+LXC/ypb7Aqno5w==} + highlight.js@10.7.3: + resolution: {integrity: sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==} + + highlightjs-vue@1.0.0: + resolution: {integrity: sha512-PDEfEF102G23vHmPhLyPboFCD+BkMGu+GuJe2d9/eH4FsCwvgBpnc9n0pGE+ffKdph38s6foEZiEjdgHdzp+IA==} + html-url-attributes@3.0.1: resolution: {integrity: sha512-ol6UPyBWqsrO6EJySPz2O7ZSr856WDrEzM5zMqp+FJJLGMW35cLYmmZnl0vztAZxRUoNZJFTCohfjuIJ8I4QBQ==} @@ -2998,9 +3092,15 @@ packages: resolution: {integrity: sha512-4gd7VpWNQNB4UKKCFFVcp1AVv+FMOgs9NKzjHKusc8jTMhd5eL1NqQqOpE0KzMds804/yHlglp3uxgluOqAPLw==} engines: {node: '>= 0.4'} + is-alphabetical@1.0.4: + resolution: {integrity: sha512-DwzsA04LQ10FHTZuL0/grVDk4rFoVH1pjAToYwBrHSxcrBIGQuXrQMtD5U1b0U2XVgKZCTLLP8u2Qxqhy3l2Vg==} + is-alphabetical@2.0.1: resolution: {integrity: sha512-FWyyY60MeTNyeSRpkM2Iry0G9hpr7/9kD40mD/cGQEuilcZYS4okz8SN2Q6rLCJ8gbCt6fN+rC+6tMGS99LaxQ==} + is-alphanumerical@1.0.4: + resolution: {integrity: sha512-UzoZUr+XfVz3t3v4KyGEniVL9BDRoQtY7tOyrRybkVNjDFWyo1yhXNGrrBTQxp3ib9BLAWs7k2YKBQsFRkZG9A==} + is-alphanumerical@2.0.1: resolution: {integrity: sha512-hmbYhX/9MUMF5uh7tOXyK/n0ZvWpad5caBA17GsC6vyuCqaWliRG5K1qS9inmUhEMaOBIW7/whAnSwveW/LtZw==} @@ -3046,6 +3146,9 @@ packages: resolution: {integrity: sha512-PwwhEakHVKTdRNVOw+/Gyh0+MzlCl4R6qKvkhuvLtPMggI1WAHt9sOwZxQLSGpUaDnrdyDsomoRgNnCfKNSXXg==} engines: {node: '>= 0.4'} + is-decimal@1.0.4: + resolution: {integrity: sha512-RGdriMmQQvZ2aqaQq3awNA6dCGtKpiDFcOzrTWrDAT2MiWrKQVPmxLGHl7Y2nNu6led0kEyoX0enY0qXYsv9zw==} + is-decimal@2.0.1: resolution: {integrity: sha512-AAB9hiomQs5DXWcRB1rqsxGUstbRroFOPPVAomNk/3XHR5JyEZChOyTWe2oayKnsSsr/kcGqF+z6yuH6HHpN0A==} @@ -3073,6 +3176,9 @@ packages: resolution: {integrity: sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==} engines: {node: '>=0.10.0'} + is-hexadecimal@1.0.4: + resolution: {integrity: sha512-gyPJuv83bHMpocVYoqof5VDiZveEoGoFL8m3BXNb2VW8Xs+rz9kqO8LOQ5DH6EsuvilT1ApazU0pyl+ytbPtlw==} + is-hexadecimal@2.0.1: resolution: {integrity: sha512-DgZQp241c8oO6cA1SbTEWiXeoxV42vlcJxgH+B3hi1AiqqKruZR3ZGF8In3fj4+/y/7rHvlOZLZtgJ/4ttYGZg==} @@ -3311,6 +3417,9 @@ packages: resolution: {integrity: sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==} hasBin: true + lowlight@1.20.0: + resolution: {integrity: sha512-8Ktj+prEb1RoCPkEOrPMYUN/nCggB7qAWe3a7OpMjWQkh3l2RD5wKRQ+o8Q8YuI9RG/xs95waaI/E6ym/7NsTw==} + 
lru-cache@10.4.3: resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==} @@ -3665,6 +3774,9 @@ packages: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} + parse-entities@2.0.0: + resolution: {integrity: sha512-kkywGpCcRYhqQIchaWqZ875wzpS/bMKhz5HnN3p7wveJTkTtyAB/AlnS0f8DFSqYW1T82t6yEAkEcB+A1I3MbQ==} + parse-entities@4.0.2: resolution: {integrity: sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw==} @@ -3741,12 +3853,23 @@ packages: resolution: {integrity: sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==} engines: {node: '>= 0.8.0'} + prismjs@1.27.0: + resolution: {integrity: sha512-t13BGPUlFDR7wRB5kQDG4jjl7XeuH6jbJGt11JHPL96qwsEHNX2+68tFXqc1/k+/jALsbSWJKUOT/hcYAZ5LkA==} + engines: {node: '>=6'} + + prismjs@1.30.0: + resolution: {integrity: sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==} + engines: {node: '>=6'} + promise@7.3.1: resolution: {integrity: sha512-nolQXZ/4L+bP/UGlkfaIujX9BKxGwmQ9OT4mOt5yvy8iK1h3wqTEJCijzGANTCCl9nWjY41juyAn2K3Q1hLLTg==} prop-types@15.8.1: resolution: {integrity: sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==} + property-information@5.6.0: + resolution: {integrity: sha512-YUHSPk+A30YPv+0Qf8i9Mbfe/C0hdPXk1s1jPVToV8pk8BQtpw10ct89Eo7OWkutrwqvT0eicAxlOg3dOAu8JA==} + property-information@6.5.0: resolution: {integrity: sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==} @@ -3870,6 +3993,11 @@ packages: '@types/react': optional: true + react-syntax-highlighter@15.6.1: + resolution: {integrity: sha512-OqJ2/vL7lEeV5zTJyG7kmARppUjiB9h9udl4qHQjjgEos66z00Ia0OckwYfRxCSFrW8RJIBnsBwQsHZbVPspqg==} + peerDependencies: + react: '>= 0.14.0' + react-textarea-autosize@8.5.7: resolution: {integrity: sha512-2MqJ3p0Jh69yt9ktFIaZmORHXw4c4bxSIhCeWiFwmJ9EYKgLmuNII3e9c9b2UO+ijl4StnpZdqpxNIhTdHvqtQ==} engines: {node: '>=10'} @@ -3904,6 +4032,9 @@ packages: resolution: {integrity: sha512-00o4I+DVrefhv+nX0ulyi3biSHCPDe+yLv5o/p6d/UVlirijB8E16FtfwSAi4g3tcqrQ4lRAqQSoFEZJehYEcw==} engines: {node: '>= 0.4'} + refractor@3.6.0: + resolution: {integrity: sha512-MY9W41IOWxxk31o+YvFCNyNzdkc9M20NoZK5vq6jkv4I/uh2zkWcfudj0Q1fovjUQJrNewS9NMzeTtqPf+n5EA==} + regenerator-runtime@0.14.1: resolution: {integrity: sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==} @@ -4091,6 +4222,9 @@ packages: resolution: {integrity: sha512-2ymg6oRBpebeZi9UUNsgQ89bhx01TcTkmNTGnNO88imTmbSgy4nfujrgVEFKWpMTEGA11EDkTt7mqObTPdigIA==} engines: {node: '>= 8'} + space-separated-tokens@1.1.5: + resolution: {integrity: sha512-q/JSVd1Lptzhf5bkYm4ob4iWPjx0KiRe3sRFBNrVqbJkFaBm5vbbowy1mymoPNLRa52+oadOhJ+K49wsSeSjTA==} + space-separated-tokens@2.0.2: resolution: {integrity: sha512-PEGlAwrG8yXGXRjW32fGbg66JAlOAwbObuqVoJpv/mRgoWDQfgH1wDPvtzWyUSNAXBGSk8h755YDbbcEy3SH2Q==} @@ -4205,6 +4339,9 @@ packages: tailwind-merge@3.2.0: resolution: {integrity: sha512-FQT/OVqCD+7edmmJpsgCsY820RTD5AkBryuG5IUqR5YQZSdj5xlH5nLgH7YPths7WsLPSpSBNneJdM8aS8aeFA==} + tailwind-merge@3.3.0: + resolution: {integrity: sha512-fyW/pEfcQSiigd5SNn0nApUOxx0zB/dm6UDU/rEwc2c3sX2smWUNbapHv+QRqLGVp9GWX3THIa7MUGPo+YkDzQ==} + tailwindcss-animate@1.0.7: resolution: {integrity: 
sha512-bl6mpH3T7I3UFxuvDEXLxy/VuFxBk5bbzplh7tXI68mwMokNYd1t9qPBHlnyTwfa4JGC4zP516I1hYYtQ/vspA==} peerDependencies: @@ -4458,6 +4595,10 @@ packages: resolution: {integrity: sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==} engines: {node: '>=12'} + xtend@4.0.2: + resolution: {integrity: sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==} + engines: {node: '>=0.4'} + yaml@1.10.2: resolution: {integrity: sha512-r3vXyErRCYJ7wg28yvBY5VSoAF8ZvlcW9/BwUzEtUsjvX/DKs24dIkuwjtuprwJJHsbyUbLApepYTR1BN4uHrg==} engines: {node: '>= 6'} @@ -5094,6 +5235,18 @@ snapshots: '@types/react': 19.0.10 '@types/react-dom': 19.0.4(@types/react@19.0.10) + '@radix-ui/react-collection@1.1.6(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0)': + dependencies: + '@radix-ui/react-compose-refs': 1.1.2(@types/react@19.0.10)(react@19.0.0) + '@radix-ui/react-context': 1.1.2(@types/react@19.0.10)(react@19.0.0) + '@radix-ui/react-primitive': 2.1.2(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0) + '@radix-ui/react-slot': 1.2.2(@types/react@19.0.10)(react@19.0.0) + react: 19.0.0 + react-dom: 19.0.0(react@19.0.0) + optionalDependencies: + '@types/react': 19.0.10 + '@types/react-dom': 19.0.4(@types/react@19.0.10) + '@radix-ui/react-compose-refs@1.0.0(react@19.0.0)': dependencies: '@babel/runtime': 7.26.9 @@ -5654,6 +5807,15 @@ snapshots: '@types/react': 19.0.10 '@types/react-dom': 19.0.4(@types/react@19.0.10) + '@radix-ui/react-primitive@2.1.2(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0)': + dependencies: + '@radix-ui/react-slot': 1.2.2(@types/react@19.0.10)(react@19.0.0) + react: 19.0.0 + react-dom: 19.0.0(react@19.0.0) + optionalDependencies: + '@types/react': 19.0.10 + '@types/react-dom': 19.0.4(@types/react@19.0.10) + '@radix-ui/react-roving-focus@1.1.2(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0)': dependencies: '@radix-ui/primitive': 1.1.1 @@ -5743,6 +5905,25 @@ snapshots: '@types/react': 19.0.10 '@types/react-dom': 19.0.4(@types/react@19.0.10) + '@radix-ui/react-slider@1.3.4(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0)': + dependencies: + '@radix-ui/number': 1.1.1 + '@radix-ui/primitive': 1.1.2 + '@radix-ui/react-collection': 1.1.6(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0) + '@radix-ui/react-compose-refs': 1.1.2(@types/react@19.0.10)(react@19.0.0) + '@radix-ui/react-context': 1.1.2(@types/react@19.0.10)(react@19.0.0) + '@radix-ui/react-direction': 1.1.1(@types/react@19.0.10)(react@19.0.0) + '@radix-ui/react-primitive': 2.1.2(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0) + '@radix-ui/react-use-controllable-state': 1.2.2(@types/react@19.0.10)(react@19.0.0) + '@radix-ui/react-use-layout-effect': 1.1.1(@types/react@19.0.10)(react@19.0.0) + '@radix-ui/react-use-previous': 1.1.1(@types/react@19.0.10)(react@19.0.0) + '@radix-ui/react-use-size': 1.1.1(@types/react@19.0.10)(react@19.0.0) + react: 19.0.0 + react-dom: 19.0.0(react@19.0.0) + optionalDependencies: + '@types/react': 19.0.10 + '@types/react-dom': 19.0.4(@types/react@19.0.10) + 
'@radix-ui/react-slot@1.0.0(react@19.0.0)': dependencies: '@babel/runtime': 7.26.9 @@ -5771,6 +5952,13 @@ snapshots: optionalDependencies: '@types/react': 19.0.10 + '@radix-ui/react-slot@1.2.2(@types/react@19.0.10)(react@19.0.0)': + dependencies: + '@radix-ui/react-compose-refs': 1.1.2(@types/react@19.0.10)(react@19.0.0) + react: 19.0.0 + optionalDependencies: + '@types/react': 19.0.10 + '@radix-ui/react-tabs@1.1.3(@types/react-dom@19.0.4(@types/react@19.0.10))(@types/react@19.0.10)(react-dom@19.0.0(react@19.0.0))(react@19.0.0)': dependencies: '@radix-ui/primitive': 1.1.1 @@ -6155,6 +6343,10 @@ snapshots: '@types/estree@1.0.6': {} + '@types/hast@2.3.10': + dependencies: + '@types/unist': 2.0.11 + '@types/hast@3.0.4': dependencies: '@types/unist': 3.0.3 @@ -6179,6 +6371,10 @@ snapshots: dependencies: '@types/react': 19.0.10 + '@types/react-syntax-highlighter@15.5.13': + dependencies: + '@types/react': 19.0.10 + '@types/react@19.0.10': dependencies: csstype: 3.1.3 @@ -6470,10 +6666,16 @@ snapshots: character-entities-html4@2.1.0: {} + character-entities-legacy@1.1.4: {} + character-entities-legacy@3.0.0: {} + character-entities@1.2.4: {} + character-entities@2.0.2: {} + character-reference-invalid@1.1.4: {} + character-reference-invalid@2.0.1: {} chokidar@3.6.0: @@ -6528,6 +6730,8 @@ snapshots: color-string: 1.9.1 optional: true + comma-separated-tokens@1.0.8: {} + comma-separated-tokens@2.0.3: {} commander@4.1.1: {} @@ -6646,7 +6850,7 @@ snapshots: react: 19.0.0 react-dom: 19.0.0(react@19.0.0) react-easy-sort: 1.6.0(react-dom@19.0.0(react@19.0.0))(react@19.0.0) - tailwind-merge: 3.2.0 + tailwind-merge: 3.3.0 tsup: 6.7.0(postcss@8.5.3)(typescript@5.8.2) transitivePeerDependencies: - '@swc/core' @@ -7120,6 +7324,10 @@ snapshots: dependencies: reusify: 1.1.0 + fault@1.0.4: + dependencies: + format: 0.2.2 + fbemitter@3.0.0: dependencies: fbjs: 3.0.5 @@ -7185,6 +7393,8 @@ snapshots: cross-spawn: 7.0.6 signal-exit: 4.1.0 + format@0.2.2: {} + framer-motion@12.4.7(react-dom@19.0.0(react@19.0.0))(react@19.0.0): dependencies: motion-dom: 12.4.5 @@ -7403,6 +7613,8 @@ snapshots: vfile-location: 5.0.3 web-namespaces: 2.0.1 + hast-util-parse-selector@2.2.5: {} + hast-util-parse-selector@4.0.0: dependencies: '@types/hast': 3.0.4 @@ -7502,6 +7714,14 @@ snapshots: dependencies: '@types/hast': 3.0.4 + hastscript@6.0.0: + dependencies: + '@types/hast': 2.3.10 + comma-separated-tokens: 1.0.8 + hast-util-parse-selector: 2.2.5 + property-information: 5.6.0 + space-separated-tokens: 1.1.5 + hastscript@9.0.1: dependencies: '@types/hast': 3.0.4 @@ -7510,6 +7730,10 @@ snapshots: property-information: 7.0.0 space-separated-tokens: 2.0.2 + highlight.js@10.7.3: {} + + highlightjs-vue@1.0.0: {} + html-url-attributes@3.0.1: {} html-void-elements@3.0.0: {} @@ -7535,8 +7759,15 @@ snapshots: hasown: 2.0.2 side-channel: 1.1.0 + is-alphabetical@1.0.4: {} + is-alphabetical@2.0.1: {} + is-alphanumerical@1.0.4: + dependencies: + is-alphabetical: 1.0.4 + is-decimal: 1.0.4 + is-alphanumerical@2.0.1: dependencies: is-alphabetical: 2.0.1 @@ -7593,6 +7824,8 @@ snapshots: call-bound: 1.0.3 has-tostringtag: 1.0.2 + is-decimal@1.0.4: {} + is-decimal@2.0.1: {} is-extendable@0.1.1: {} @@ -7616,6 +7849,8 @@ snapshots: dependencies: is-extglob: 2.1.1 + is-hexadecimal@1.0.4: {} + is-hexadecimal@2.0.1: {} is-map@2.0.3: {} @@ -7819,6 +8054,11 @@ snapshots: dependencies: js-tokens: 4.0.0 + lowlight@1.20.0: + dependencies: + fault: 1.0.4 + highlight.js: 10.7.3 + lru-cache@10.4.3: {} lru-cache@11.1.0: {} @@ -8438,6 +8678,15 @@ snapshots: 
dependencies: callsites: 3.1.0 + parse-entities@2.0.0: + dependencies: + character-entities: 1.2.4 + character-entities-legacy: 1.1.4 + character-reference-invalid: 1.1.4 + is-alphanumerical: 1.0.4 + is-decimal: 1.0.4 + is-hexadecimal: 1.0.4 + parse-entities@4.0.2: dependencies: '@types/unist': 2.0.11 @@ -8506,6 +8755,10 @@ snapshots: prelude-ls@1.2.1: {} + prismjs@1.27.0: {} + + prismjs@1.30.0: {} + promise@7.3.1: dependencies: asap: 2.0.6 @@ -8516,6 +8769,10 @@ snapshots: object-assign: 4.1.1 react-is: 16.13.1 + property-information@5.6.0: + dependencies: + xtend: 4.0.2 + property-information@6.5.0: {} property-information@7.0.0: {} @@ -8648,6 +8905,16 @@ snapshots: optionalDependencies: '@types/react': 19.0.10 + react-syntax-highlighter@15.6.1(react@19.0.0): + dependencies: + '@babel/runtime': 7.26.9 + highlight.js: 10.7.3 + highlightjs-vue: 1.0.0 + lowlight: 1.20.0 + prismjs: 1.30.0 + react: 19.0.0 + refractor: 3.6.0 + react-textarea-autosize@8.5.7(@types/react@19.0.10)(react@19.0.0): dependencies: '@babel/runtime': 7.26.9 @@ -8706,6 +8973,12 @@ snapshots: get-proto: 1.0.1 which-builtin-type: 1.2.1 + refractor@3.6.0: + dependencies: + hastscript: 6.0.0 + parse-entities: 2.0.0 + prismjs: 1.27.0 + regenerator-runtime@0.14.1: {} regex-recursion@6.0.2: @@ -8980,6 +9253,8 @@ snapshots: dependencies: whatwg-url: 7.1.0 + space-separated-tokens@1.1.5: {} + space-separated-tokens@2.0.2: {} sprintf-js@1.0.3: {} @@ -9110,6 +9385,8 @@ snapshots: tailwind-merge@3.2.0: {} + tailwind-merge@3.3.0: {} + tailwindcss-animate@1.0.7(tailwindcss@4.0.9): dependencies: tailwindcss: 4.0.9 @@ -9410,6 +9687,8 @@ snapshots: string-width: 5.1.2 strip-ansi: 7.1.0 + xtend@4.0.2: {} + yaml@1.10.2: {} yocto-queue@0.1.0: {}