From 3675505eb1a9c4ca967aacfc80698999c9e37f3e Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 15:53:33 -0700 Subject: [PATCH 01/70] feat: Added LinkUP Search Engine Connector --- .../alembic/versions/4_add_linkup_api_enum.py | 45 ++++ .../app/agents/researcher/nodes.py | 70 ++++-- surfsense_backend/app/db.py | 3 +- .../app/schemas/search_source_connector.py | 10 + surfsense_backend/app/temp_test.py | 17 ++ .../app/utils/connector_service.py | 95 ++++++++ surfsense_backend/pyproject.toml | 1 + surfsense_backend/uv.lock | 15 ++ .../connectors/(manage)/page.tsx | 1 + .../connectors/[connector_id]/edit/page.tsx | 11 + .../connectors/[connector_id]/page.tsx | 16 +- .../connectors/add/linkup-api/page.tsx | 207 ++++++++++++++++++ .../[search_space_id]/connectors/add/page.tsx | 9 +- .../components/ModernHeroWithGradients.tsx | 2 +- .../components/chat/ConnectorComponents.tsx | 4 +- .../components/editConnector/types.ts | 1 + surfsense_web/hooks/useConnectorEditPage.ts | 11 +- surfsense_web/lib/connectors/utils.ts | 1 + 18 files changed, 492 insertions(+), 27 deletions(-) create mode 100644 surfsense_backend/alembic/versions/4_add_linkup_api_enum.py create mode 100644 surfsense_backend/app/temp_test.py create mode 100644 surfsense_web/app/dashboard/[search_space_id]/connectors/add/linkup-api/page.tsx diff --git a/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py new file mode 100644 index 000000000..8ccfac2d2 --- /dev/null +++ b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py @@ -0,0 +1,45 @@ +"""Add LINKUP_API to SearchSourceConnectorType enum + +Revision ID: 4 +Revises: 3 +Create Date: 2025-04-18 10:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '4' +down_revision: Union[str, None] = '3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + + # Manually add the command to add the enum value + op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'LINKUP_API'") + + # Pass for the rest, as autogenerate didn't run to add other schema details + pass + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + + # Downgrading removal of an enum value requires recreating the type + op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") + op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR', 'LINEAR_CONNECTOR')") + op.execute(( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + )) + op.execute("DROP TYPE searchsourceconnectortype_old") + + pass + # ### end Alembic commands ### \ No newline at end of file diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index 15935f2ea..1b42d7155 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -143,7 +143,7 @@ async def fetch_relevant_documents( connectors_to_search: List[str], writer: StreamWriter = None, state: State = None, - top_k: int = 20 + top_k: int = 10 ) -> List[Dict[str, Any]]: """ Fetch relevant documents for research questions using the provided connectors. 
@@ -264,22 +264,6 @@ async def fetch_relevant_documents( streaming_service.only_update_terminal(f"Found {len(files_chunks)} file chunks relevant to the query") writer({"yeild_value": streaming_service._format_annotations()}) - elif connector == "TAVILY_API": - source_object, tavily_chunks = await connector_service.search_tavily( - user_query=reformulated_query, - user_id=user_id, - top_k=top_k - ) - - # Add to sources and raw documents - if source_object: - all_sources.append(source_object) - all_raw_documents.extend(tavily_chunks) - - # Stream found document count - if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(tavily_chunks)} web search results relevant to the query") - writer({"yeild_value": streaming_service._format_annotations()}) elif connector == "SLACK_CONNECTOR": source_object, slack_chunks = await connector_service.search_slack( @@ -352,6 +336,47 @@ async def fetch_relevant_documents( if streaming_service and writer: streaming_service.only_update_terminal(f"Found {len(linear_chunks)} Linear issues relevant to the query") writer({"yeild_value": streaming_service._format_annotations()}) + + elif connector == "TAVILY_API": + source_object, tavily_chunks = await connector_service.search_tavily( + user_query=reformulated_query, + user_id=user_id, + top_k=top_k + ) + + # Add to sources and raw documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(tavily_chunks) + + # Stream found document count + if streaming_service and writer: + streaming_service.only_update_terminal(f"Found {len(tavily_chunks)} web search results relevant to the query") + writer({"yeild_value": streaming_service._format_annotations()}) + + elif connector == "LINKUP_API": + if top_k > 10: + linkup_mode = "deep" + else: + linkup_mode = "standard" + + source_object, linkup_chunks = await connector_service.search_linkup( + user_query=reformulated_query, + user_id=user_id, + mode=linkup_mode + ) + + # Add to sources 
and raw documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(linkup_chunks) + + # Stream found document count + if streaming_service and writer: + streaming_service.only_update_terminal(f"Found {len(linkup_chunks)} Linkup chunks relevant to the query") + writer({"yeild_value": streaming_service._format_annotations()}) + + except Exception as e: error_message = f"Error searching connector {connector}: {str(e)}" print(error_message) @@ -462,6 +487,14 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW streaming_service.only_update_terminal("Searching for relevant information across all connectors...") writer({"yeild_value": streaming_service._format_annotations()}) + if configuration.num_sections == 1: + TOP_K = 10 + elif configuration.num_sections == 3: + TOP_K = 20 + elif configuration.num_sections == 6: + TOP_K = 30 + + relevant_documents = [] async with async_session_maker() as db_session: try: @@ -472,7 +505,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW db_session=db_session, connectors_to_search=configuration.connectors_to_search, writer=writer, - state=state + state=state, + top_k=TOP_K ) except Exception as e: error_message = f"Error fetching relevant documents: {str(e)}" diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 4426f4ffa..320f059dd 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -44,8 +44,9 @@ class DocumentType(str, Enum): LINEAR_CONNECTOR = "LINEAR_CONNECTOR" class SearchSourceConnectorType(str, Enum): - SERPER_API = "SERPER_API" + SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT TAVILY_API = "TAVILY_API" + LINKUP_API = "LINKUP_API" SLACK_CONNECTOR = "SLACK_CONNECTOR" NOTION_CONNECTOR = "NOTION_CONNECTOR" GITHUB_CONNECTOR = "GITHUB_CONNECTOR" diff --git 
a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index 6accc12af..cb7152e06 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -36,6 +36,16 @@ class SearchSourceConnectorBase(BaseModel): # Ensure the API key is not empty if not config.get("TAVILY_API_KEY"): raise ValueError("TAVILY_API_KEY cannot be empty") + + elif connector_type == SearchSourceConnectorType.LINKUP_API: + # For LINKUP_API, only allow LINKUP_API_KEY + allowed_keys = ["LINKUP_API_KEY"] + if set(config.keys()) != set(allowed_keys): + raise ValueError(f"For LINKUP_API connector type, config must only contain these keys: {allowed_keys}") + + # Ensure the API key is not empty + if not config.get("LINKUP_API_KEY"): + raise ValueError("LINKUP_API_KEY cannot be empty") elif connector_type == SearchSourceConnectorType.SLACK_CONNECTOR: # For SLACK_CONNECTOR, only allow SLACK_BOT_TOKEN diff --git a/surfsense_backend/app/temp_test.py b/surfsense_backend/app/temp_test.py new file mode 100644 index 000000000..f8ff10fec --- /dev/null +++ b/surfsense_backend/app/temp_test.py @@ -0,0 +1,17 @@ +from linkup import LinkupClient + +# Initialize the client (API key can be read from the environment variable or passed as an argument) +client = LinkupClient( + api_key="0ed1d08a-c8eb-4f01-9e3d-67cf87a3cd8f" +) + +# Perform a search query +search_response = client.search( + query="What is Surfsense?", + depth="standard", # "standard" or "deep" + output_type="searchResults", # "searchResults" or "sourcedAnswer" or "structured" + structured_output_schema=None, # must be filled if output_type is "structured" +) +print(search_response) + +# results=[LinkupSearchTextResult(type='text', name='SurfSense - Future Tools', url='https://www.futuretools.io/tools/surfsense', content='SurfSense is an open-source AI research assistant that functions as a personal, private alternative to 
tools like NotebookLM or Perplexity. It enables users to save webpages (even those behind login walls), upload documents, and build a searchable knowledge base that can be queried through natural language. The tool integrates with various external sources including search engines, Slack ...'), LinkupSearchTextResult(type='text', name='r/selfhosted on Reddit: SurfSense - Personal AI Assistant for World Wide Web Surfers.', url='https://www.reddit.com/r/selfhosted/comments/1fl58vh/surfsense_personal_ai_assistant_for_world_wide/', content='14 votes, 22 comments. Hi Everyone, For the past few months I have been trying to build a Personal AI Assistant for World Wide Web Surfers. It…\nWhat it is and why I am making it: Well when I’m browsing the internet, I tend to save a ton of content—but remembering when and what you saved? Total brain freeze! That’s where SurfSense comes in. SurfSense is a Personal AI Assistant for anything you see (Social Media Chats, Calendar Invites, Important Mails, Tutorials, Recipes and anything ) on the World Wide Web.\nPlease test it out at https://github.com/MODSetter/SurfSense and let me know your feedback.\nPosted by u/Uiqueblhats - 14 votes and 22 comments'), LinkupSearchTextResult(type='text', name='SurfSense - GitHub', url='https://github.com/DLMJR/surfsense', content='While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. 
It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Notion, YouTube, GitHub and more to come.'), LinkupSearchTextResult(type='text', name='How to Set Up and Use SurfSense: Your Personal AI Assistant', url='https://fxis.ai/edu/how-to-set-up-and-use-surfsense-your-personal-ai-assistant/', content='SurfSense is the answer to the common struggle of remembering what content you’ve saved while browsing the internet. Imagine your favorite library, but instead of books, it’s filled with every useful webpage, chat message, recipe, and tutorial you’ve come across. With SurfSense, you can instantly recall any of these digital treasures. Let’s embark on a journey to set up and utilize ...'), LinkupSearchTextResult(type='text', name='Surf Sense | F6S', url='https://www.f6s.com/surfsense', content='Surf Sense - Government - Surf Sense is the modern infrastructure network of the ocean.\nsurfsense.com.au · Nathan Adler · Sydney, Australia · Product leader, ex-engineer, start-up founder & maker, with end-to-end product development background in software and hardware. Product · Employee @Airtasker · Product · Employee @SafetyCulture · B Engineering / B Commerce @UNSW See 3 more ·'), LinkupSearchTextResult(type='text', name='Surf Sense | Online Surf Coaching & Knowledge Platform', url='https://www.surf-sense.com/', content='Join Surf Sense, the ultimate online surf coaching platform designed for intermediate and advanced surfers. Access expert-guided courses, weekly live Q&A sessions, and a thriving global surf community. 
Start improving your surfing today!\nundefined'), LinkupSearchTextResult(type='text', name='SurfSense - The Open Source Alternative to NotebookLM / Perplexity ...', url='https://www.redditmedia.com/r/selfhosted/comments/1jzi67a/surfsense_the_open_source_alternative_to/', content="For those of you who aren't familiar with SurfSense, it aims to be the open-source alternative to NotebookLM, Perplexity, or Glean. In short, it's a Highly Customizable AI Research Agent but connected to your personal external sources like search engines (Tavily), Slack, Notion, YouTube, GitHub, and more coming soon."), LinkupSearchTextResult(type='text', name='GitHub - MODSetter/SurfSense: Open Source Alternative to NotebookLM ...', url='https://github.com/MODSetter/SurfSense', content='While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more to come.'), LinkupSearchTextResult(type='text', name='GitHub - MODSetter/SurfSense: Open Source Alternative to NotebookLM / Perplexity / Glean, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more.', url='https://github.com/MODSetter/SurfSense', content='Open Source Alternative to NotebookLM / Perplexity / Glean, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more. 
- MODSetter/SurfSense\nWhile tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base.\nThe SurfSense extension can be used to save any webpage you like.\nThe SurfSense Podcast feature is currently being reworked for better UI and stability.\nSurfSense is actively being developed.'), LinkupSearchTextResult(type='text', name='SurfSense - Chrome Web Store', url='https://chromewebstore.google.com/detail/surfsense/jihmihbdpfjhppdlifphccgefjhifblf', content='Extension to collect Browsing History for SurfSense.\nWell when I’m browsing the internet, I tend to save a ton of content—but remembering when and what you saved? Total brain freeze! ❄️ That’s where SurfSense comes in. SurfSense is like a Knowledge Graph 🧠 Brain 🧠 for anything you see on the World Wide Web.\nSurfSense has disclosed the following information regarding the collection and usage of your data.\nThen, ask your personal knowledge base anything about your saved content., and voilà—instant recall! 
🧑\u200d💻🌐 Use this extension to capture & save your Web Content and chat with your personal Knowledge Graph 🧠 Brain 🧠 at https://www.surfsense.net')] \ No newline at end of file diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py index 9a6e13c43..7f88c1c0f 100644 --- a/surfsense_backend/app/utils/connector_service.py +++ b/surfsense_backend/app/utils/connector_service.py @@ -5,6 +5,7 @@ from sqlalchemy.future import select from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever from app.db import SearchSourceConnector, SearchSourceConnectorType from tavily import TavilyClient +from linkup import LinkupClient class ConnectorService: @@ -643,3 +644,97 @@ class ConnectorService: } return result_object, linear_chunks + + async def search_linkup(self, user_query: str, user_id: str, mode: str = "standard") -> tuple: + """ + Search using Linkup API and return both the source information and documents + + Args: + user_query: The user's query + user_id: The user's ID + mode: Search depth mode, can be "standard" or "deep" + + Returns: + tuple: (sources_info, documents) + """ + # Get Linkup connector configuration + linkup_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.LINKUP_API) + + if not linkup_connector: + # Return empty results if no Linkup connector is configured + return { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": [], + }, [] + + # Initialize Linkup client with API key from connector config + linkup_api_key = linkup_connector.config.get("LINKUP_API_KEY") + linkup_client = LinkupClient(api_key=linkup_api_key) + + # Perform search with Linkup + try: + response = linkup_client.search( + query=user_query, + depth=mode, # Use the provided mode ("standard" or "deep") + output_type="searchResults", # Default to search results + ) + + # Extract results from Linkup response - access as attribute instead of using .get() + 
linkup_results = response.results if hasattr(response, 'results') else [] + + # Process each result and create sources directly without deduplication + sources_list = [] + documents = [] + + for i, result in enumerate(linkup_results): + # Fix for UI + linkup_results[i]['document']['id'] = self.source_id_counter + # Create a source entry + source = { + "id": self.source_id_counter, + "title": result.name if hasattr(result, 'name') else "Linkup Result", + "description": result.content[:100] if hasattr(result, 'content') else "", + "url": result.url if hasattr(result, 'url') else "" + } + sources_list.append(source) + + # Create a document entry + document = { + "chunk_id": f"linkup_chunk_{i}", + "content": result.content if hasattr(result, 'content') else "", + "score": 1.0, # Default score since not provided by Linkup + "document": { + "id": self.source_id_counter, + "title": result.name if hasattr(result, 'name') else "Linkup Result", + "document_type": "LINKUP_API", + "metadata": { + "url": result.url if hasattr(result, 'url') else "", + "type": result.type if hasattr(result, 'type') else "", + "source": "LINKUP_API" + } + } + } + documents.append(document) + self.source_id_counter += 1 + + # Create result object + result_object = { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": sources_list, + } + + return result_object, documents + + except Exception as e: + # Log the error and return empty results + print(f"Error searching with Linkup: {str(e)}") + return { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": [], + }, [] diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 7b7a6f900..8f8dc4c0e 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "langchain-community>=0.3.17", "langchain-unstructured>=0.1.6", "langgraph>=0.3.29", + "linkup-sdk>=0.2.4", "litellm>=1.61.4", "markdownify>=0.14.1", 
"notion-client>=2.3.0", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 9b485b0df..9601bccb3 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -1413,6 +1413,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8b/e4/5380e8229c442e406404977d2ec71a9db6a3e6a89fce7791c6ad7cd2bdbe/langsmith-0.3.8-py3-none-any.whl", hash = "sha256:fbb9dd97b0f090219447fca9362698d07abaeda1da85aa7cc6ec6517b36581b1", size = 332800 }, ] +[[package]] +name = "linkup-sdk" +version = "0.2.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/c7/d9a85331bf2611ecac67f1ad92a6ced641b2e2e93eea26b17a9af701b3d1/linkup_sdk-0.2.4.tar.gz", hash = "sha256:2b8fd1894b9b4715bc14aabcbf53df6def9024f2cc426f234cc59e1807ec4c12", size = 9392 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/d8/bb9e01328fe5ad979e3e459c0f76321d295663906deef56eeaa5ce0cf269/linkup_sdk-0.2.4-py3-none-any.whl", hash = "sha256:8bc4c4f34de93529136a14e42441d803868d681c2bf3fd59be51923e44f1f1d4", size = 8325 }, +] + [[package]] name = "litellm" version = "1.61.4" @@ -3078,6 +3091,7 @@ dependencies = [ { name = "langchain-community" }, { name = "langchain-unstructured" }, { name = "langgraph" }, + { name = "linkup-sdk" }, { name = "litellm" }, { name = "markdownify" }, { name = "notion-client" }, @@ -3106,6 +3120,7 @@ requires-dist = [ { name = "langchain-community", specifier = ">=0.3.17" }, { name = "langchain-unstructured", specifier = ">=0.1.6" }, { name = "langgraph", specifier = ">=0.3.29" }, + { name = "linkup-sdk", specifier = ">=0.2.4" }, { name = "litellm", specifier = ">=1.61.4" }, { name = "markdownify", specifier = ">=0.14.1" }, { name = "notion-client", specifier = ">=2.3.0" }, diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx 
b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx index 24fe6265d..af92a6ae5 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx @@ -46,6 +46,7 @@ const getConnectorTypeDisplay = (type: string): string => { "NOTION_CONNECTOR": "Notion", "GITHUB_CONNECTOR": "GitHub", "LINEAR_CONNECTOR": "Linear", + "LINKUP_API": "Linkup", // Add other connector types here as needed }; return typeMap[type] || type; diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx index d41295faa..5afea12c9 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx @@ -160,6 +160,17 @@ export default function EditConnectorPage() { /> )} + {/* == Linkup == */} + {connector.connector_type === 'LINKUP_API' && ( + + )} + + + + + + Connect Linkup API + + Integrate with Linkup API to enhance your search capabilities with AI-powered search results. + + + + + + API Key Required + + You'll need a Linkup API key to use this connector. You can get one by signing up at{" "} + + linkup.so + + + + +
+ + ( + + Connector Name + + + + + A friendly name to identify this connector. + + + + )} + /> + + ( + + Linkup API Key + + + + + Your API key will be encrypted and stored securely. + + + + )} + /> + +
+ +
+ + +
+ +

What you get with Linkup API:

+
    +
  • AI-powered search results tailored to your queries
  • +
  • Real-time information from the web
  • +
  • Enhanced search capabilities for your projects
  • +
+
+
+
+ + ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx index 1f7490270..c04dae645 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx @@ -16,6 +16,7 @@ import { IconWorldWww, IconTicket, IconLayoutKanban, + IconLinkPlus, } from "@tabler/icons-react"; import { AnimatePresence, motion } from "framer-motion"; import Link from "next/link"; @@ -50,7 +51,13 @@ const connectorCategories: ConnectorCategory[] = [ icon: , status: "available", }, - // Add other search engine connectors like Tavily, Serper if they have UI config + { + id: "linkup-api", + title: "Linkup API", + description: "Search the web using the Linkup API", + icon: , + status: "available", + }, ], }, { diff --git a/surfsense_web/components/ModernHeroWithGradients.tsx b/surfsense_web/components/ModernHeroWithGradients.tsx index 052c993da..b30c4bc82 100644 --- a/surfsense_web/components/ModernHeroWithGradients.tsx +++ b/surfsense_web/components/ModernHeroWithGradients.tsx @@ -36,7 +36,7 @@ export function ModernHeroWithGradients() {

- A Customizable AI Research Agent just like NotebookLM or Perplexity, but connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more. + A Customizable AI Research Agent just like NotebookLM or Perplexity, but connected to external sources such as search engines (Tavily, LinkUp), Slack, Linear, Notion, YouTube, GitHub and more.

{ const iconProps = { className: "h-4 w-4" }; switch(connectorType) { + case 'LINKUP_API': + return ; case 'LINEAR_CONNECTOR': return ; case 'GITHUB_CONNECTOR': diff --git a/surfsense_web/components/editConnector/types.ts b/surfsense_web/components/editConnector/types.ts index 364f23526..cc43e1c81 100644 --- a/surfsense_web/components/editConnector/types.ts +++ b/surfsense_web/components/editConnector/types.ts @@ -30,5 +30,6 @@ export const editConnectorSchema = z.object({ SERPER_API_KEY: z.string().optional(), TAVILY_API_KEY: z.string().optional(), LINEAR_API_KEY: z.string().optional(), + LINKUP_API_KEY: z.string().optional(), }); export type EditConnectorFormValues = z.infer; diff --git a/surfsense_web/hooks/useConnectorEditPage.ts b/surfsense_web/hooks/useConnectorEditPage.ts index d7672025d..7e81c5524 100644 --- a/surfsense_web/hooks/useConnectorEditPage.ts +++ b/surfsense_web/hooks/useConnectorEditPage.ts @@ -59,7 +59,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string) NOTION_INTEGRATION_TOKEN: config.NOTION_INTEGRATION_TOKEN || "", SERPER_API_KEY: config.SERPER_API_KEY || "", TAVILY_API_KEY: config.TAVILY_API_KEY || "", - LINEAR_API_KEY: config.LINEAR_API_KEY || "" + LINEAR_API_KEY: config.LINEAR_API_KEY || "", + LINKUP_API_KEY: config.LINKUP_API_KEY || "" }); if (currentConnector.connector_type === 'GITHUB_CONNECTOR') { const savedRepos = config.repo_full_names || []; @@ -164,6 +165,12 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string) newConfig = { LINEAR_API_KEY: formData.LINEAR_API_KEY }; } break; + case 'LINKUP_API': + if (formData.LINKUP_API_KEY !== originalConfig.LINKUP_API_KEY) { + if (!formData.LINKUP_API_KEY) { toast.error("Linkup API Key cannot be empty."); setIsSaving(false); return; } + newConfig = { LINKUP_API_KEY: formData.LINKUP_API_KEY }; + } + break; } if (newConfig !== null) { @@ -203,6 +210,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: 
string) editForm.setValue('TAVILY_API_KEY', newlySavedConfig.TAVILY_API_KEY || ""); } else if(connector.connector_type === 'LINEAR_CONNECTOR') { editForm.setValue('LINEAR_API_KEY', newlySavedConfig.LINEAR_API_KEY || ""); + } else if(connector.connector_type === 'LINKUP_API') { + editForm.setValue('LINKUP_API_KEY', newlySavedConfig.LINKUP_API_KEY || ""); } } if (connector.connector_type === 'GITHUB_CONNECTOR') { diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts index 5efc59386..f93bd3f82 100644 --- a/surfsense_web/lib/connectors/utils.ts +++ b/surfsense_web/lib/connectors/utils.ts @@ -7,6 +7,7 @@ export const getConnectorTypeDisplay = (type: string): string => { "NOTION_CONNECTOR": "Notion", "GITHUB_CONNECTOR": "GitHub", "LINEAR_CONNECTOR": "Linear", + "LINKUP_API": "Linkup", }; return typeMap[type] || type; }; From a945aceac77b402ac33a20f669fd1ad6a692229e Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 15:56:31 -0700 Subject: [PATCH 02/70] chore: readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e412fe2be..ad8633c47 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ # SurfSense -While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more to come. +While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily, LinkUp), Slack, Linear, Notion, YouTube, GitHub and more to come. 
# Video @@ -42,7 +42,7 @@ Open source and easy to deploy locally. - RAG as a Service API Backend. #### ℹ️ **External Sources** -- Search Engines (Tavily) +- Search Engines (Tavily, LinkUp) - Slack - Linear - Notion From cc4e02183ba8b98e3b50dcdcea46d4efe0eb54b6 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 15:59:48 -0700 Subject: [PATCH 03/70] oops disables --- surfsense_backend/app/temp_test.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 surfsense_backend/app/temp_test.py diff --git a/surfsense_backend/app/temp_test.py b/surfsense_backend/app/temp_test.py deleted file mode 100644 index f8ff10fec..000000000 --- a/surfsense_backend/app/temp_test.py +++ /dev/null @@ -1,17 +0,0 @@ -from linkup import LinkupClient - -# Initialize the client (API key can be read from the environment variable or passed as an argument) -client = LinkupClient( - api_key="0ed1d08a-c8eb-4f01-9e3d-67cf87a3cd8f" -) - -# Perform a search query -search_response = client.search( - query="What is Surfsense?", - depth="standard", # "standard" or "deep" - output_type="searchResults", # "searchResults" or "sourcedAnswer" or "structured" - structured_output_schema=None, # must be filled if output_type is "structured" -) -print(search_response) - -# results=[LinkupSearchTextResult(type='text', name='SurfSense - Future Tools', url='https://www.futuretools.io/tools/surfsense', content='SurfSense is an open-source AI research assistant that functions as a personal, private alternative to tools like NotebookLM or Perplexity. It enables users to save webpages (even those behind login walls), upload documents, and build a searchable knowledge base that can be queried through natural language. 
The tool integrates with various external sources including search engines, Slack ...'), LinkupSearchTextResult(type='text', name='r/selfhosted on Reddit: SurfSense - Personal AI Assistant for World Wide Web Surfers.', url='https://www.reddit.com/r/selfhosted/comments/1fl58vh/surfsense_personal_ai_assistant_for_world_wide/', content='14 votes, 22 comments. Hi Everyone, For the past few months I have been trying to build a Personal AI Assistant for World Wide Web Surfers. It…\nWhat it is and why I am making it: Well when I’m browsing the internet, I tend to save a ton of content—but remembering when and what you saved? Total brain freeze! That’s where SurfSense comes in. SurfSense is a Personal AI Assistant for anything you see (Social Media Chats, Calendar Invites, Important Mails, Tutorials, Recipes and anything ) on the World Wide Web.\nPlease test it out at https://github.com/MODSetter/SurfSense and let me know your feedback.\nPosted by u/Uiqueblhats - 14 votes and 22 comments'), LinkupSearchTextResult(type='text', name='SurfSense - GitHub', url='https://github.com/DLMJR/surfsense', content='While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Notion, YouTube, GitHub and more to come.'), LinkupSearchTextResult(type='text', name='How to Set Up and Use SurfSense: Your Personal AI Assistant', url='https://fxis.ai/edu/how-to-set-up-and-use-surfsense-your-personal-ai-assistant/', content='SurfSense is the answer to the common struggle of remembering what content you’ve saved while browsing the internet. Imagine your favorite library, but instead of books, it’s filled with every useful webpage, chat message, recipe, and tutorial you’ve come across. 
With SurfSense, you can instantly recall any of these digital treasures. Let’s embark on a journey to set up and utilize ...'), LinkupSearchTextResult(type='text', name='Surf Sense | F6S', url='https://www.f6s.com/surfsense', content='Surf Sense - Government - Surf Sense is the modern infrastructure network of the ocean.\nsurfsense.com.au · Nathan Adler · Sydney, Australia · Product leader, ex-engineer, start-up founder & maker, with end-to-end product development background in software and hardware. Product · Employee @Airtasker · Product · Employee @SafetyCulture · B Engineering / B Commerce @UNSW See 3 more ·'), LinkupSearchTextResult(type='text', name='Surf Sense | Online Surf Coaching & Knowledge Platform', url='https://www.surf-sense.com/', content='Join Surf Sense, the ultimate online surf coaching platform designed for intermediate and advanced surfers. Access expert-guided courses, weekly live Q&A sessions, and a thriving global surf community. Start improving your surfing today!\nundefined'), LinkupSearchTextResult(type='text', name='SurfSense - The Open Source Alternative to NotebookLM / Perplexity ...', url='https://www.redditmedia.com/r/selfhosted/comments/1jzi67a/surfsense_the_open_source_alternative_to/', content="For those of you who aren't familiar with SurfSense, it aims to be the open-source alternative to NotebookLM, Perplexity, or Glean. In short, it's a Highly Customizable AI Research Agent but connected to your personal external sources like search engines (Tavily), Slack, Notion, YouTube, GitHub, and more coming soon."), LinkupSearchTextResult(type='text', name='GitHub - MODSetter/SurfSense: Open Source Alternative to NotebookLM ...', url='https://github.com/MODSetter/SurfSense', content='While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. 
It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more to come.'), LinkupSearchTextResult(type='text', name='GitHub - MODSetter/SurfSense: Open Source Alternative to NotebookLM / Perplexity / Glean, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more.', url='https://github.com/MODSetter/SurfSense', content='Open Source Alternative to NotebookLM / Perplexity / Glean, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more. - MODSetter/SurfSense\nWhile tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base.\nThe SurfSense extension can be used to save any webpage you like.\nThe SurfSense Podcast feature is currently being reworked for better UI and stability.\nSurfSense is actively being developed.'), LinkupSearchTextResult(type='text', name='SurfSense - Chrome Web Store', url='https://chromewebstore.google.com/detail/surfsense/jihmihbdpfjhppdlifphccgefjhifblf', content='Extension to collect Browsing History for SurfSense.\nWell when I’m browsing the internet, I tend to save a ton of content—but remembering when and what you saved? Total brain freeze! ❄️ That’s where SurfSense comes in. SurfSense is like a Knowledge Graph 🧠 Brain 🧠 for anything you see on the World Wide Web.\nSurfSense has disclosed the following information regarding the collection and usage of your data.\nThen, ask your personal knowledge base anything about your saved content., and voilà—instant recall! 
🧑\u200d💻🌐 Use this extension to capture & save your Web Content and chat with your personal Knowledge Graph 🧠 Brain 🧠 at https://www.surfsense.net')] \ No newline at end of file From f956a39498ba5989cf327ab00eaf64ac4b7e73d1 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 16:17:20 -0700 Subject: [PATCH 04/70] chore(fix): linkup citation mapping --- surfsense_backend/app/utils/connector_service.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py index 7f88c1c0f..23e3035e8 100644 --- a/surfsense_backend/app/utils/connector_service.py +++ b/surfsense_backend/app/utils/connector_service.py @@ -684,13 +684,24 @@ class ConnectorService: # Extract results from Linkup response - access as attribute instead of using .get() linkup_results = response.results if hasattr(response, 'results') else [] + # Only proceed if we have results + if not linkup_results: + return { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": [], + }, [] + # Process each result and create sources directly without deduplication sources_list = [] documents = [] for i, result in enumerate(linkup_results): - # Fix for UI - linkup_results[i]['document']['id'] = self.source_id_counter + # Only process results that have content + if not hasattr(result, 'content') or not result.content: + continue + # Create a source entry source = { "id": self.source_id_counter, From a971bb1f721889bc343bf18e9b414184efdb0f20 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 20:39:17 -0700 Subject: [PATCH 05/70] chore: update README and refactor ConnectorService for improved document handling and error management --- surfsense_backend/README.md | 1 - .../app/agents/researcher/nodes.py | 12 +- .../researcher/sub_section_writer/nodes.py | 2 +- .../app/utils/connector_service.py | 687 ++++++++++-------- 4 files 
changed, 399 insertions(+), 303 deletions(-) diff --git a/surfsense_backend/README.md b/surfsense_backend/README.md index 879fa4372..f78ec7df5 100644 --- a/surfsense_backend/README.md +++ b/surfsense_backend/README.md @@ -110,7 +110,6 @@ See pyproject.toml for detailed dependency information. Key dependencies include - fastapi and related packages - fastapi-users: Authentication and user management - firecrawl-py: Web crawling capabilities -- gpt-researcher: Advanced research capabilities - langchain components for AI workflows - litellm: LLM model integration - pgvector: Vector similarity search in PostgreSQL diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index 1b42d7155..4c3bc721f 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -143,7 +143,8 @@ async def fetch_relevant_documents( connectors_to_search: List[str], writer: StreamWriter = None, state: State = None, - top_k: int = 10 + top_k: int = 10, + connector_service: ConnectorService = None ) -> List[Dict[str, Any]]: """ Fetch relevant documents for research questions using the provided connectors. 
@@ -162,7 +163,7 @@ async def fetch_relevant_documents( List of relevant documents """ # Initialize services - connector_service = ConnectorService(db_session) + # connector_service = ConnectorService(db_session) # Only use streaming if both writer and state are provided streaming_service = state.streaming_service if state is not None else None @@ -494,10 +495,12 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW elif configuration.num_sections == 6: TOP_K = 30 - relevant_documents = [] async with async_session_maker() as db_session: try: + # Create connector service inside the db_session scope + connector_service = ConnectorService(db_session) + relevant_documents = await fetch_relevant_documents( research_questions=all_questions, user_id=configuration.user_id, @@ -506,7 +509,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW connectors_to_search=configuration.connectors_to_search, writer=writer, state=state, - top_k=TOP_K + top_k=TOP_K, + connector_service=connector_service ) except Exception as e: error_message = f"Error fetching relevant documents: {str(e)}" diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py index 0bec4618c..f1d50aeeb 100644 --- a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py +++ b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py @@ -102,7 +102,7 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A # Extract content and metadata content = doc.get("content", "") doc_info = doc.get("document", {}) - document_id = doc_info.get("id", f"{i+1}") # Use document ID or index+1 as source_id + document_id = doc_info.get("id") # Use document ID # Format document according to the citation system prompt's expected format formatted_doc = f""" diff --git a/surfsense_backend/app/utils/connector_service.py 
b/surfsense_backend/app/utils/connector_service.py index 23e3035e8..c7ad692e0 100644 --- a/surfsense_backend/app/utils/connector_service.py +++ b/surfsense_backend/app/utils/connector_service.py @@ -1,5 +1,6 @@ import json from typing import List, Dict, Any, Optional, Tuple +import asyncio from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever @@ -13,6 +14,7 @@ class ConnectorService: self.session = session self.retriever = ChucksHybridSearchRetriever(session) self.source_id_counter = 1 + self.counter_lock = asyncio.Lock() # Lock to protect counter in multithreaded environments async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: """ @@ -29,25 +31,35 @@ class ConnectorService: document_type="CRAWLED_URL" ) + # Early return if no results + if not crawled_urls_chunks: + return { + "id": 1, + "name": "Crawled URLs", + "type": "CRAWLED_URL", + "sources": [], + }, [] + # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(crawled_urls_chunks): - # Fix for UI - crawled_urls_chunks[i]['document']['id'] = self.source_id_counter - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(crawled_urls_chunks): + # Fix for UI + crawled_urls_chunks[i]['document']['id'] = self.source_id_counter + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a source entry - source = { - "id": self.source_id_counter, - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') - } + # Create a source entry + source = { + "id": self.source_id_counter, 
+ "title": document.get('title', 'Untitled Document'), + "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), + "url": metadata.get('url', '') + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -73,26 +85,36 @@ class ConnectorService: search_space_id=search_space_id, document_type="FILE" ) + + # Early return if no results + if not files_chunks: + return { + "id": 2, + "name": "Files", + "type": "FILE", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(files_chunks): - # Fix for UI - files_chunks[i]['document']['id'] = self.source_id_counter - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(files_chunks): + # Fix for UI + files_chunks[i]['document']['id'] = self.source_id_counter + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a source entry - source = { - "id": self.source_id_counter, - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') - } + # Create a source entry + source = { + "id": self.source_id_counter, + "title": document.get('title', 'Untitled Document'), + "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), + "url": metadata.get('url', '') + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -163,39 +185,49 @@ class ConnectorService: # Extract results from Tavily response tavily_results = 
response.get("results", []) + # Early return if no results + if not tavily_results: + return { + "id": 3, + "name": "Tavily Search", + "type": "TAVILY_API", + "sources": [], + }, [] + # Process each result and create sources directly without deduplication sources_list = [] documents = [] - for i, result in enumerate(tavily_results): - - # Create a source entry - source = { - "id": self.source_id_counter, - "title": result.get("title", "Tavily Result"), - "description": result.get("content", "")[:100], - "url": result.get("url", "") - } - sources_list.append(source) - - # Create a document entry - document = { - "chunk_id": f"tavily_chunk_{i}", - "content": result.get("content", ""), - "score": result.get("score", 0.0), - "document": { + async with self.counter_lock: + for i, result in enumerate(tavily_results): + + # Create a source entry + source = { "id": self.source_id_counter, "title": result.get("title", "Tavily Result"), - "document_type": "TAVILY_API", - "metadata": { - "url": result.get("url", ""), - "published_date": result.get("published_date", ""), - "source": "TAVILY_API" + "description": result.get("content", "")[:100], + "url": result.get("url", "") + } + sources_list.append(source) + + # Create a document entry + document = { + "chunk_id": f"tavily_chunk_{i}", + "content": result.get("content", ""), + "score": result.get("score", 0.0), + "document": { + "id": self.source_id_counter, + "title": result.get("title", "Tavily Result"), + "document_type": "TAVILY_API", + "metadata": { + "url": result.get("url", ""), + "published_date": result.get("published_date", ""), + "source": "TAVILY_API" + } } } - } - documents.append(document) - self.source_id_counter += 1 + documents.append(document) + self.source_id_counter += 1 # Create result object result_object = { @@ -231,45 +263,55 @@ class ConnectorService: search_space_id=search_space_id, document_type="SLACK_CONNECTOR" ) + + # Early return if no results + if not slack_chunks: + return { + "id": 4, + 
"name": "Slack", + "type": "SLACK_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(slack_chunks): - # Fix for UI - slack_chunks[i]['document']['id'] = self.source_id_counter - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(slack_chunks): + # Fix for UI + slack_chunks[i]['document']['id'] = self.source_id_counter + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a mapped source entry with Slack-specific metadata - channel_name = metadata.get('channel_name', 'Unknown Channel') - channel_id = metadata.get('channel_id', '') - message_date = metadata.get('start_date', '') - - # Create a more descriptive title for Slack messages - title = f"Slack: {channel_name}" - if message_date: - title += f" ({message_date})" + # Create a mapped source entry with Slack-specific metadata + channel_name = metadata.get('channel_name', 'Unknown Channel') + channel_id = metadata.get('channel_id', '') + message_date = metadata.get('start_date', '') - # Create a more descriptive description for Slack messages - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." - - # For URL, we can use a placeholder or construct a URL to the Slack channel if available - url = "" - if channel_id: - url = f"https://slack.com/app_redirect?channel={channel_id}" + # Create a more descriptive title for Slack messages + title = f"Slack: {channel_name}" + if message_date: + title += f" ({message_date})" + + # Create a more descriptive description for Slack messages + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." 
+ + # For URL, we can use a placeholder or construct a URL to the Slack channel if available + url = "" + if channel_id: + url = f"https://slack.com/app_redirect?channel={channel_id}" - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -301,47 +343,57 @@ class ConnectorService: search_space_id=search_space_id, document_type="NOTION_CONNECTOR" ) + + # Early return if no results + if not notion_chunks: + return { + "id": 5, + "name": "Notion", + "type": "NOTION_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(notion_chunks): - # Fix for UI - notion_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) - - # Create a mapped source entry with Notion-specific metadata - page_title = metadata.get('page_title', 'Untitled Page') - page_id = metadata.get('page_id', '') - indexed_at = metadata.get('indexed_at', '') - - # Create a more descriptive title for Notion pages - title = f"Notion: {page_title}" - if indexed_at: - title += f" (indexed: {indexed_at})" + async with self.counter_lock: + for i, chunk in enumerate(notion_chunks): + # Fix for UI + notion_chunks[i]['document']['id'] = self.source_id_counter - # Create a more descriptive description for Notion pages - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." 
+ # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) + + # Create a mapped source entry with Notion-specific metadata + page_title = metadata.get('page_title', 'Untitled Page') + page_id = metadata.get('page_id', '') + indexed_at = metadata.get('indexed_at', '') - # For URL, we can use a placeholder or construct a URL to the Notion page if available - url = "" - if page_id: - # Notion page URLs follow this format - url = f"https://notion.so/{page_id.replace('-', '')}" + # Create a more descriptive title for Notion pages + title = f"Notion: {page_title}" + if indexed_at: + title += f" (indexed: {indexed_at})" + + # Create a more descriptive description for Notion pages + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." + + # For URL, we can use a placeholder or construct a URL to the Notion page if available + url = "" + if page_id: + # Notion page URLs follow this format + url = f"https://notion.so/{page_id.replace('-', '')}" - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -373,65 +425,75 @@ class ConnectorService: search_space_id=search_space_id, document_type="EXTENSION" ) + + # Early return if no results + if not extension_chunks: + return { + "id": 6, + "name": "Extension", + "type": "EXTENSION", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(extension_chunks): - # Fix for UI - extension_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = 
document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(extension_chunks): + # Fix for UI + extension_chunks[i]['document']['id'] = self.source_id_counter + + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Extract extension-specific metadata - webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page') - webpage_url = metadata.get('VisitedWebPageURL', '') - visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') - visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '') - browsing_session_id = metadata.get('BrowsingSessionId', '') - - # Create a more descriptive title for extension data - title = webpage_title - if visit_date: - # Format the date for display (simplified) - try: - # Just extract the date part for display - formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date - title += f" (visited: {formatted_date})" - except: - # Fallback if date parsing fails - title += f" (visited: {visit_date})" + # Extract extension-specific metadata + webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page') + webpage_url = metadata.get('VisitedWebPageURL', '') + visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') + visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '') + browsing_session_id = metadata.get('BrowsingSessionId', '') - # Create a more descriptive description for extension data - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." 
- - # Add visit duration if available - if visit_duration: - try: - duration_seconds = int(visit_duration) / 1000 - if duration_seconds < 60: - duration_text = f"{duration_seconds:.1f} seconds" - else: - duration_text = f"{duration_seconds/60:.1f} minutes" + # Create a more descriptive title for extension data + title = webpage_title + if visit_date: + # Format the date for display (simplified) + try: + # Just extract the date part for display + formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date + title += f" (visited: {formatted_date})" + except: + # Fallback if date parsing fails + title += f" (visited: {visit_date})" - if description: - description += f" | Duration: {duration_text}" - except: - # Fallback if duration parsing fails - pass + # Create a more descriptive description for extension data + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." + + # Add visit duration if available + if visit_duration: + try: + duration_seconds = int(visit_duration) / 1000 + if duration_seconds < 60: + duration_text = f"{duration_seconds:.1f} seconds" + else: + duration_text = f"{duration_seconds/60:.1f} minutes" + + if description: + description += f" | Duration: {duration_text}" + except: + # Fallback if duration parsing fails + pass - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": webpage_url - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": webpage_url + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -463,47 +525,57 @@ class ConnectorService: search_space_id=search_space_id, document_type="YOUTUBE_VIDEO" ) + + # Early return if no results + if not youtube_chunks: + return { + "id": 7, + "name": "YouTube Videos", + "type": "YOUTUBE_VIDEO", + "sources": [], + }, [] # Process 
each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(youtube_chunks): - # Fix for UI - youtube_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) - - # Extract YouTube-specific metadata - video_title = metadata.get('video_title', 'Untitled Video') - video_id = metadata.get('video_id', '') - channel_name = metadata.get('channel_name', '') - published_date = metadata.get('published_date', '') - - # Create a more descriptive title for YouTube videos - title = video_title - if channel_name: - title += f" - {channel_name}" + async with self.counter_lock: + for i, chunk in enumerate(youtube_chunks): + # Fix for UI + youtube_chunks[i]['document']['id'] = self.source_id_counter - # Create a more descriptive description for YouTube videos - description = metadata.get('description', chunk.get('content', '')[:100]) - if len(description) == 100: - description += "..." + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) + + # Extract YouTube-specific metadata + video_title = metadata.get('video_title', 'Untitled Video') + video_id = metadata.get('video_id', '') + channel_name = metadata.get('channel_name', '') + published_date = metadata.get('published_date', '') - # For URL, construct a URL to the YouTube video - url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" + # Create a more descriptive title for YouTube videos + title = video_title + if channel_name: + title += f" - {channel_name}" + + # Create a more descriptive description for YouTube videos + description = metadata.get('description', chunk.get('content', '')[:100]) + if len(description) == 100: + description += "..." 
+ + # For URL, construct a URL to the YouTube video + url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - "video_id": video_id, # Additional field for YouTube videos - "channel_name": channel_name # Additional field for YouTube videos - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + "video_id": video_id, # Additional field for YouTube videos + "channel_name": channel_name # Additional field for YouTube videos + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -529,27 +601,37 @@ class ConnectorService: search_space_id=search_space_id, document_type="GITHUB_CONNECTOR" ) + + # Early return if no results + if not github_chunks: + return { + "id": 8, + "name": "GitHub", + "type": "GITHUB_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(github_chunks): - # Fix for UI - assign a unique ID for citation/source tracking - github_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(github_chunks): + # Fix for UI - assign a unique ID for citation/source tracking + github_chunks[i]['document']['id'] = self.source_id_counter + + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a source entry - source = { - "id": self.source_id_counter, - "title": document.get('title', 'GitHub Document'), # Use specific title if available - "description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content 
preview - "url": metadata.get('url', '') # Use URL if available in metadata - } + # Create a source entry + source = { + "id": self.source_id_counter, + "title": document.get('title', 'GitHub Document'), # Use specific title if available + "description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content preview + "url": metadata.get('url', '') # Use URL if available in metadata + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -581,59 +663,69 @@ class ConnectorService: search_space_id=search_space_id, document_type="LINEAR_CONNECTOR" ) + + # Early return if no results + if not linear_chunks: + return { + "id": 9, + "name": "Linear Issues", + "type": "LINEAR_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(linear_chunks): - # Fix for UI - linear_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) - - # Extract Linear-specific metadata - issue_identifier = metadata.get('issue_identifier', '') - issue_title = metadata.get('issue_title', 'Untitled Issue') - issue_state = metadata.get('state', '') - comment_count = metadata.get('comment_count', 0) - - # Create a more descriptive title for Linear issues - title = f"Linear: {issue_identifier} - {issue_title}" - if issue_state: - title += f" ({issue_state})" + async with self.counter_lock: + for i, chunk in enumerate(linear_chunks): + # Fix for UI + linear_chunks[i]['document']['id'] = self.source_id_counter - # Create a more descriptive description for Linear issues - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." 
- - # Add comment count info to description - if comment_count: - if description: - description += f" | Comments: {comment_count}" - else: - description = f"Comments: {comment_count}" - - # For URL, we could construct a URL to the Linear issue if we have the workspace info - # For now, use a generic placeholder - url = "" - if issue_identifier: - # This is a generic format, may need to be adjusted based on actual Linear workspace - url = f"https://linear.app/issue/{issue_identifier}" + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - "issue_identifier": issue_identifier, - "state": issue_state, - "comment_count": comment_count - } + # Extract Linear-specific metadata + issue_identifier = metadata.get('issue_identifier', '') + issue_title = metadata.get('issue_title', 'Untitled Issue') + issue_state = metadata.get('state', '') + comment_count = metadata.get('comment_count', 0) + + # Create a more descriptive title for Linear issues + title = f"Linear: {issue_identifier} - {issue_title}" + if issue_state: + title += f" ({issue_state})" + + # Create a more descriptive description for Linear issues + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." 
+ + # Add comment count info to description + if comment_count: + if description: + description += f" | Comments: {comment_count}" + else: + description = f"Comments: {comment_count}" + + # For URL, we could construct a URL to the Linear issue if we have the workspace info + # For now, use a generic placeholder + url = "" + if issue_identifier: + # This is a generic format, may need to be adjusted based on actual Linear workspace + url = f"https://linear.app/issue/{issue_identifier}" - self.source_id_counter += 1 - sources_list.append(source) + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + "issue_identifier": issue_identifier, + "state": issue_state, + "comment_count": comment_count + } + + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -697,38 +789,39 @@ class ConnectorService: sources_list = [] documents = [] - for i, result in enumerate(linkup_results): - # Only process results that have content - if not hasattr(result, 'content') or not result.content: - continue - - # Create a source entry - source = { - "id": self.source_id_counter, - "title": result.name if hasattr(result, 'name') else "Linkup Result", - "description": result.content[:100] if hasattr(result, 'content') else "", - "url": result.url if hasattr(result, 'url') else "" - } - sources_list.append(source) - - # Create a document entry - document = { - "chunk_id": f"linkup_chunk_{i}", - "content": result.content if hasattr(result, 'content') else "", - "score": 1.0, # Default score since not provided by Linkup - "document": { + async with self.counter_lock: + for i, result in enumerate(linkup_results): + # Only process results that have content + if not hasattr(result, 'content') or not result.content: + continue + + # Create a source entry + source = { "id": self.source_id_counter, "title": result.name if hasattr(result, 'name') else "Linkup Result", - "document_type": "LINKUP_API", - 
"metadata": { - "url": result.url if hasattr(result, 'url') else "", - "type": result.type if hasattr(result, 'type') else "", - "source": "LINKUP_API" + "description": result.content[:100] if hasattr(result, 'content') else "", + "url": result.url if hasattr(result, 'url') else "" + } + sources_list.append(source) + + # Create a document entry + document = { + "chunk_id": f"linkup_chunk_{i}", + "content": result.content if hasattr(result, 'content') else "", + "score": 1.0, # Default score since not provided by Linkup + "document": { + "id": self.source_id_counter, + "title": result.name if hasattr(result, 'name') else "Linkup Result", + "document_type": "LINKUP_API", + "metadata": { + "url": result.url if hasattr(result, 'url') else "", + "type": result.type if hasattr(result, 'type') else "", + "source": "LINKUP_API" + } } } - } - documents.append(document) - self.source_id_counter += 1 + documents.append(document) + self.source_id_counter += 1 # Create result object result_object = { From 22da221ad533e7cf257b2c7171704d73cafb5592 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Tue, 29 Apr 2025 23:02:07 -0700 Subject: [PATCH 06/70] feat: Shifted to RecursiveChunker and CodeChunker - Codebase Q/A should be lot better now. 
--- surfsense_backend/app/config/__init__.py | 14 +- .../app/tasks/background_tasks.py | 40 +-- .../app/tasks/connectors_indexing_tasks.py | 10 +- surfsense_backend/pyproject.toml | 2 +- surfsense_backend/uv.lock | 288 +++++++++++++++--- 5 files changed, 285 insertions(+), 69 deletions(-) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index c7f842b71..4adf2b7dc 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -1,12 +1,10 @@ import os from pathlib import Path -from chonkie import AutoEmbeddings, LateChunker -from rerankers import Reranker -from langchain_community.chat_models import ChatLiteLLM - - +from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker from dotenv import load_dotenv +from langchain_community.chat_models import ChatLiteLLM +from rerankers import Reranker # Get the base directory of the project BASE_DIR = Path(__file__).resolve().parent.parent.parent @@ -39,10 +37,12 @@ class Config: # Chonkie Configuration | Edit this to your needs EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL") embedding_model_instance = AutoEmbeddings.get_embeddings(EMBEDDING_MODEL) - chunker_instance = LateChunker( - embedding_model=EMBEDDING_MODEL, + chunker_instance = RecursiveChunker( chunk_size=embedding_model_instance.max_seq_length, ) + code_chunker_instance = CodeChunker( + chunk_size=embedding_model_instance.max_seq_length + ) # Reranker's Configuration | Pinecode, Cohere etc. 
Read more at https://github.com/AnswerDotAI/rerankers?tab=readme-ov-file#usage RERANKERS_MODEL_NAME = os.getenv("RERANKERS_MODEL_NAME") diff --git a/surfsense_backend/app/tasks/background_tasks.py b/surfsense_backend/app/tasks/background_tasks.py index b2f6f8c81..68b56c435 100644 --- a/surfsense_backend/app/tasks/background_tasks.py +++ b/surfsense_backend/app/tasks/background_tasks.py @@ -80,7 +80,7 @@ async def add_crawled_url_document( # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(content_in_markdown) ] @@ -166,7 +166,7 @@ async def add_extension_received_document( # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(content.pageContent) ] @@ -215,7 +215,7 @@ async def add_received_file_document( # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(file_in_markdown) ] @@ -256,14 +256,14 @@ async def add_youtube_video_document( """ try: from youtube_transcript_api import YouTubeTranscriptApi - + # Extract video ID from URL def get_youtube_video_id(url: str): from urllib.parse import urlparse, parse_qs - + parsed_url = urlparse(url) hostname = parsed_url.hostname - + if hostname == "youtu.be": return parsed_url.path[1:] if hostname in ("www.youtube.com", "youtube.com"): @@ -275,26 +275,27 @@ async def add_youtube_video_document( if parsed_url.path.startswith("/v/"): return parsed_url.path.split("/")[2] return None - + # Get video ID video_id = get_youtube_video_id(url) if not video_id: raise ValueError(f"Could not extract video ID from URL: {url}") - + # Get video metadata import 
json from urllib.parse import urlencode from urllib.request import urlopen - - params = {"format": "json", "url": f"https://www.youtube.com/watch?v={video_id}"} + + params = {"format": "json", + "url": f"https://www.youtube.com/watch?v={video_id}"} oembed_url = "https://www.youtube.com/oembed" query_string = urlencode(params) full_url = oembed_url + "?" + query_string - + with urlopen(full_url) as response: response_text = response.read() video_data = json.loads(response_text.decode()) - + # Get video transcript try: captions = YouTubeTranscriptApi.get_transcript(video_id) @@ -309,7 +310,7 @@ async def add_youtube_video_document( transcript_text = "\n".join(transcript_segments) except Exception as e: transcript_text = f"No captions available for this video. Error: {str(e)}" - + # Format document metadata in a more maintainable way metadata_sections = [ ("METADATA", [ @@ -343,17 +344,18 @@ async def add_youtube_video_document( summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance summary_result = await summary_chain.ainvoke({"document": combined_document_string}) summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_embedding = config.embedding_model_instance.embed( + summary_content) # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(transcript_text) ] - + # Create document from app.db import Document, DocumentType - + document = Document( title=video_data.get("title", "YouTube Video"), document_type=DocumentType.YOUTUBE_VIDEO, @@ -369,11 +371,11 @@ async def add_youtube_video_document( chunks=chunks, search_space_id=search_space_id ) - + session.add(document) await session.commit() await session.refresh(document) - + return document except SQLAlchemyError as db_error: await session.rollback() diff --git 
a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py index 7c210628d..94643a45d 100644 --- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py +++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py @@ -222,7 +222,7 @@ async def index_slack_messages( # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(channel_content) ] @@ -515,7 +515,7 @@ async def index_notion_pages( # Process chunks logger.debug(f"Chunking content for page {page_title}") chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(markdown_content) ] @@ -720,8 +720,8 @@ async def index_github_repos( # Chunk the content try: chunks_data = [ - Chunk(content=chunk.text, embedding=chunk.embedding) - for chunk in config.chunker_instance.chunk(file_content) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + for chunk in config.code_chunker_instance.chunk(file_content) ] except Exception as chunk_err: logger.error(f"Failed to chunk file {full_path_key}: {chunk_err}") @@ -984,7 +984,7 @@ async def index_linear_issues( # Process chunks - using the full issue content with comments chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(issue_content) ] diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 8f8dc4c0e..c447a74b8 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -7,7 +7,7 @@ requires-python = ">=3.12" dependencies = [ "alembic>=1.13.0", "asyncpg>=0.30.0", - 
"chonkie[all]>=0.4.1", + "chonkie[all]>=1.0.6", "fastapi>=0.115.8", "fastapi-users[oauth,sqlalchemy]>=14.0.1", "firecrawl-py>=1.12.0", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 9601bccb3..a5621abda 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -13,6 +13,24 @@ resolution-markers = [ "(python_full_version < '3.12.4' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12.4' and sys_platform != 'darwin' and sys_platform != 'linux')", ] +[[package]] +name = "accelerate" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/6e/c29a1dcde7db07f47870ed63e5124086b11874ad52ccd533dc1ca2c799da/accelerate-1.6.0.tar.gz", hash = "sha256:28c1ef1846e690944f98b68dc7b8bb6c51d032d45e85dcbb3adb0c8b99dffb32", size = 363804 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/b1/8198e3cdd11a426b1df2912e3381018c4a4a55368f6d0857ba3ca418ef93/accelerate-1.6.0-py3-none-any.whl", hash = "sha256:1aee717d3d3735ad6d09710a7c26990ee4652b79b4e93df46551551b5227c2aa", size = 354748 }, +] + [[package]] name = "aiofiles" version = "24.1.0" @@ -201,19 +219,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fc/30/d4986a882011f9df997a55e6becd864812ccfcd821d64aac8570ee39f719/attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a", size = 63152 }, ] -[[package]] -name = "autotiktokenizer" -version = "0.2.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "huggingface-hub" }, - { name = "tiktoken" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/a6/1a/c6f494750dc67c2e5b06b91ae9565d46adb384f25f61a7136ff79dd02413/autotiktokenizer-0.2.2.tar.gz", hash = "sha256:f0954f14cedfe538b96ba0eed2e39996378c0bdf649fd977d6a047e419e05fdb", size = 15401 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/7b/c34469a1495d755bac1c80fbf3c0c2c29eb03ffe61172d889426025173bd/autotiktokenizer-0.2.2-py3-none-any.whl", hash = "sha256:ebbf15d9d5516fcb3287a8153bd8efbcc932f9c99089b2357255413cf37815d9", size = 8957 }, -] - [[package]] name = "backoff" version = "2.2.1" @@ -363,23 +368,36 @@ wheels = [ [[package]] name = "chonkie" -version = "0.4.1" +version = "1.0.6" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "autotiktokenizer" }, + { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2e/94/4a1bc8bdf06e7327bb256abb85767647125286c9bbc7cbcd77a550b96d63/chonkie-0.4.1.tar.gz", hash = "sha256:164216efa01af02e750e7cb218cea87918a18f83ebbd8f020b25557f1ed36aa9", size = 43284 } +sdist = { url = "https://files.pythonhosted.org/packages/5a/db/16d5d23a216db734bcb68e61c466ff48a55dc0d2cdc7ecdd73aaea1f6f7d/chonkie-1.0.6.tar.gz", hash = "sha256:feefad3cbbb62b4a55f4c6409bd8d8f0ee180d8319c4d32e31539a768955b3b0", size = 70056 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/b5/c0d77500a413794773edb630bdc7061121c237a4eaf6ce222226c200d603/chonkie-0.4.1-py3-none-any.whl", hash = "sha256:af7d95d17f4ed60a26e32f0bad60f807287e3301189114755d727657ed2ef964", size = 51193 }, + { url = "https://files.pythonhosted.org/packages/bc/46/d6d9789eb6e61bfa073a13fd2b5cbbcf022a7781adbb060a25d82f16437e/chonkie-1.0.6-py3-none-any.whl", hash = "sha256:d8cfcf665cb6a64ac6ca87da61207372a88b9e5a7bb697faade78069c853e4b1", size = 89526 }, ] [package.optional-dependencies] all = [ + { name = "accelerate" }, + { name = "cohere" }, + { name = "google-genai" }, + { name = "huggingface-hub" }, + { name = "jsonschema" }, + { name = 
"magika" }, { name = "model2vec" }, { name = "numpy" }, { name = "openai" }, + { name = "pydantic" }, + { name = "rich" }, { name = "sentence-transformers" }, + { name = "tiktoken" }, + { name = "torch" }, + { name = "transformers" }, + { name = "tree-sitter" }, + { name = "tree-sitter-language-pack" }, ] [[package]] @@ -394,6 +412,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, ] +[[package]] +name = "cohere" +version = "5.15.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fastavro" }, + { name = "httpx" }, + { name = "httpx-sse" }, + { name = "pydantic" }, + { name = "pydantic-core" }, + { name = "requests" }, + { name = "tokenizers" }, + { name = "types-requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/33/69c7d1b25a20eafef4197a1444c7f87d5241e936194e54876ea8996157e6/cohere-5.15.0.tar.gz", hash = "sha256:e802d4718ddb0bb655654382ebbce002756a3800faac30296cde7f1bdc6ff2cc", size = 135021 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/87/94694db7fe6df979fbc03286eaabdfa98f1c8fa532960e5afdf965e10960/cohere-5.15.0-py3-none-any.whl", hash = "sha256:22ff867c2a6f2fc2b585360c6072f584f11f275ef6d9242bac24e0fa2df1dfb5", size = 259522 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -660,6 +698,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/08/9968963c1fb8c34627b7f1fbcdfe9438540f87dc7c9bfb59bb4fd19a4ecf/fastapi_users_db_sqlalchemy-7.0.0-py3-none-any.whl", hash = "sha256:5fceac018e7cfa69efc70834dd3035b3de7988eb4274154a0dbe8b14f5aa001e", size = 6891 }, ] +[[package]] +name = "fastavro" +version = "1.10.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f3/67/7121d2221e998706cac00fa779ec44c1c943cb65e8a7ed1bd57d78d93f2c/fastavro-1.10.0.tar.gz", hash = "sha256:47bf41ac6d52cdfe4a3da88c75a802321321b37b663a900d12765101a5d6886f", size = 987970 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/a4/8e69c0a5cd121e5d476237de1bde5a7947f791ae45768ae52ed0d3ea8d18/fastavro-1.10.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cfe57cb0d72f304bd0dcc5a3208ca6a7363a9ae76f3073307d095c9d053b29d4", size = 1036343 }, + { url = "https://files.pythonhosted.org/packages/1e/01/aa219e2b33e5873d27b867ec0fad9f35f23d461114e1135a7e46c06786d2/fastavro-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74e517440c824cb65fb29d3e3903a9406f4d7c75490cef47e55c4c82cdc66270", size = 3263368 }, + { url = "https://files.pythonhosted.org/packages/a7/ba/1766e2d7d95df2e95e9e9a089dc7a537c0616720b053a111a918fa7ee6b6/fastavro-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:203c17d44cadde76e8eecb30f2d1b4f33eb478877552d71f049265dc6f2ecd10", size = 3328933 }, + { url = "https://files.pythonhosted.org/packages/2e/40/26e56696b9696ab4fbba25a96b8037ca3f9fd8a8cc55b4b36400ef023e49/fastavro-1.10.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6575be7f2b5f94023b5a4e766b0251924945ad55e9a96672dc523656d17fe251", size = 3258045 }, + { url = "https://files.pythonhosted.org/packages/4e/bc/2f6c92c06c5363372abe828bccdd95762f2c1983b261509f94189c38c8a1/fastavro-1.10.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe471deb675ed2f01ee2aac958fbf8ebb13ea00fa4ce7f87e57710a0bc592208", size = 3418001 }, + { url = "https://files.pythonhosted.org/packages/0c/ce/cfd16546c04ebbca1be80873b533c788cec76f7bfac231bfac6786047572/fastavro-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:567ff515f2a5d26d9674b31c95477f3e6022ec206124c62169bc2ffaf0889089", size = 487855 }, + { url = 
"https://files.pythonhosted.org/packages/c9/c4/163cf154cc694c2dccc70cd6796db6214ac668a1260bf0310401dad188dc/fastavro-1.10.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:82263af0adfddb39c85f9517d736e1e940fe506dfcc35bc9ab9f85e0fa9236d8", size = 1022741 }, + { url = "https://files.pythonhosted.org/packages/38/01/a24598f5f31b8582a92fe9c41bf91caeed50d5b5eaa7576e6f8b23cb488d/fastavro-1.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:566c193109ff0ff84f1072a165b7106c4f96050078a4e6ac7391f81ca1ef3efa", size = 3237421 }, + { url = "https://files.pythonhosted.org/packages/a7/bf/08bcf65cfb7feb0e5b1329fafeb4a9b95b7b5ec723ba58c7dbd0d04ded34/fastavro-1.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e400d2e55d068404d9fea7c5021f8b999c6f9d9afa1d1f3652ec92c105ffcbdd", size = 3300222 }, + { url = "https://files.pythonhosted.org/packages/53/4d/a6c25f3166328f8306ec2e6be1123ed78a55b8ab774a43a661124508881f/fastavro-1.10.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b8227497f71565270f9249fc9af32a93644ca683a0167cfe66d203845c3a038", size = 3233276 }, + { url = "https://files.pythonhosted.org/packages/47/1c/b2b2ce2bf866a248ae23e96a87b3b8369427ff79be9112073039bee1d245/fastavro-1.10.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8e62d04c65461b30ac6d314e4197ad666371e97ae8cb2c16f971d802f6c7f514", size = 3388936 }, + { url = "https://files.pythonhosted.org/packages/1f/2c/43927e22a2d57587b3aa09765098a6d833246b672d34c10c5f135414745a/fastavro-1.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:86baf8c9740ab570d0d4d18517da71626fe9be4d1142bea684db52bd5adb078f", size = 483967 }, +] + [[package]] name = "filelock" version = "3.17.0" @@ -858,6 +916,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/b6/60f2910485d32f7bba92cc33e5053b3f29d61fccaa57e5e58c600bb7e0d2/google_cloud_vision-3.10.1-py3-none-any.whl", hash = "sha256:91959ea12b0d6a8442e30c0a5062cd305f349a4840f9184b5061b3153bbd8476", 
size = 526076 }, ] +[[package]] +name = "google-genai" +version = "1.12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "google-auth" }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "typing-extensions" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/9c/c907dbea921663bb7c41f415337bedd08259d17da8d156396c7237611744/google_genai-1.12.1.tar.gz", hash = "sha256:5c7eda422360643ce602a3f6b23152470ec1039310ef40080cbe4e71237f6391", size = 167752 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/2c/5b454dec837328eb167e78f45a14da502af223f8b94a4824e2fd0df74f19/google_genai-1.12.1-py3-none-any.whl", hash = "sha256:7cbc1bc029712946ce41bcf80c0eaa89eb8c09c308efbbfe30fd491f402c258a", size = 165940 }, +] + [[package]] name = "googleapis-common-protos" version = "1.69.2" @@ -1490,6 +1566,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/80/83/8c54533b3576f4391eebea88454738978669a6cad0d8e23266224007939d/lxml-5.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:91fb6a43d72b4f8863d21f347a9163eecbf36e76e2f51068d59cd004c506f332", size = 3814484 }, ] +[[package]] +name = "magika" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "numpy" }, + { name = "onnxruntime" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/18/ea70f6abd36f455037340f12c8125918c726d08cd6e01f0b76b6884e0c38/magika-0.6.1.tar.gz", hash = "sha256:e3dd22c73936630b1cd79d0f412d6d9a53dc99ba5e3709b1ac53f56bc998e635", size = 3030234 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/be/c9f7bb9ee94abe8d344b660672001313e459c67b867b24abe32d5c80a9ce/magika-0.6.1-py3-none-any.whl", hash = "sha256:15838d2469f1394d8e9598bc7fceea1ede7f35aebe9675c6b45c6b5c48315931", size = 2968516 }, + { url = 
"https://files.pythonhosted.org/packages/3c/b9/016b174520e81faef5edb31b6c7a73966dc84ee33acd23a2e7b775df7ba4/magika-0.6.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:dadd036296a2e4840fd48fa0712848fe122da438e8f607dc8f19ca4663c359dc", size = 12408519 }, + { url = "https://files.pythonhosted.org/packages/02/b7/e7dfeb235823a82d676c68a748541c24db0249b854f945f6e3cec11c1b7e/magika-0.6.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:133c0e1a844361de86ca2dd7c530e38b324e86177d30c52e36fd82101c190b5c", size = 15089294 }, + { url = "https://files.pythonhosted.org/packages/64/f0/bec5bff0125d08c1bc3baef88beeb910121085249f67b5994ea961615b55/magika-0.6.1-py3-none-win_amd64.whl", hash = "sha256:0342b6230ea9aea7ab4b8fa92e1b46f1cc62e724d452ee8d6821a37f56738d22", size = 12378455 }, +] + [[package]] name = "makefun" version = "1.15.6" @@ -1643,7 +1737,7 @@ wheels = [ [[package]] name = "model2vec" -version = "0.4.0" +version = "0.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2" }, @@ -1655,9 +1749,9 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/83/e2/3fb7bd8c612f71ad3abded92e7401f97f1e71427d3a68a3fb85f39394b17/model2vec-0.4.0.tar.gz", hash = "sha256:48d4a3da040499b0090f736eb8f22ea0fdd35b67462d81d789c70004423adbae", size = 2486998 } +sdist = { url = "https://files.pythonhosted.org/packages/b8/c1/3cd6cab10e8b7da8c32acebf85672d38a26f5f03165bfeaa617a5ec0bb61/model2vec-0.4.1.tar.gz", hash = "sha256:fc6038416679eebe448951708f2d0bebdee8510f47970af1c81a8f054a3c3f9f", size = 2660626 } wheels = [ - { url = "https://files.pythonhosted.org/packages/93/7d/39ff093c4e45303a06e3c5825c6144cbd21f18a1393a154bbf93232b0f1a/model2vec-0.4.0-py3-none-any.whl", hash = "sha256:df30685a55841c61c6638e4f329648e76b148507bd778801d7bfcd6b970a4f2f", size = 38593 }, + { url = 
"https://files.pythonhosted.org/packages/cd/76/c8575f90f521017597c5e57e3bfef61e3f27d9cb6c741a82a24d72b10a60/model2vec-0.4.1-py3-none-any.whl", hash = "sha256:04a397a17da9b967082b6baa4c494f0be48c89ec4e1a3975b4f290f045238a38", size = 41972 }, ] [[package]] @@ -1764,18 +1858,40 @@ wheels = [ [[package]] name = "numpy" -version = "1.26.4" +version = "2.2.5" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129 } +sdist = { url = "https://files.pythonhosted.org/packages/dc/b2/ce4b867d8cd9c0ee84938ae1e6a6f7926ebf928c9090d036fc3c6a04f946/numpy-2.2.5.tar.gz", hash = "sha256:a9c0d994680cd991b1cb772e8b297340085466a6fe964bc9d4e80f5e2f43c291", size = 20273920 } wheels = [ - { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901 }, - { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868 }, - { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109 }, - { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613 }, - { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172 }, - { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643 }, - { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803 }, - { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754 }, + { url = "https://files.pythonhosted.org/packages/e2/f7/1fd4ff108cd9d7ef929b8882692e23665dc9c23feecafbb9c6b80f4ec583/numpy-2.2.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ee461a4eaab4f165b68780a6a1af95fb23a29932be7569b9fab666c407969051", size = 20948633 }, + { url = "https://files.pythonhosted.org/packages/12/03/d443c278348371b20d830af155ff2079acad6a9e60279fac2b41dbbb73d8/numpy-2.2.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ec31367fd6a255dc8de4772bd1658c3e926d8e860a0b6e922b615e532d320ddc", size = 14176123 }, + { url = "https://files.pythonhosted.org/packages/2b/0b/5ca264641d0e7b14393313304da48b225d15d471250376f3fbdb1a2be603/numpy-2.2.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:47834cde750d3c9f4e52c6ca28a7361859fcaf52695c7dc3cc1a720b8922683e", size = 5163817 }, + { url = 
"https://files.pythonhosted.org/packages/04/b3/d522672b9e3d28e26e1613de7675b441bbd1eaca75db95680635dd158c67/numpy-2.2.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:2c1a1c6ccce4022383583a6ded7bbcda22fc635eb4eb1e0a053336425ed36dfa", size = 6698066 }, + { url = "https://files.pythonhosted.org/packages/a0/93/0f7a75c1ff02d4b76df35079676b3b2719fcdfb39abdf44c8b33f43ef37d/numpy-2.2.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d75f338f5f79ee23548b03d801d28a505198297534f62416391857ea0479571", size = 14087277 }, + { url = "https://files.pythonhosted.org/packages/b0/d9/7c338b923c53d431bc837b5b787052fef9ae68a56fe91e325aac0d48226e/numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a801fef99668f309b88640e28d261991bfad9617c27beda4a3aec4f217ea073", size = 16135742 }, + { url = "https://files.pythonhosted.org/packages/2d/10/4dec9184a5d74ba9867c6f7d1e9f2e0fb5fe96ff2bf50bb6f342d64f2003/numpy-2.2.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:abe38cd8381245a7f49967a6010e77dbf3680bd3627c0fe4362dd693b404c7f8", size = 15581825 }, + { url = "https://files.pythonhosted.org/packages/80/1f/2b6fcd636e848053f5b57712a7d1880b1565eec35a637fdfd0a30d5e738d/numpy-2.2.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5a0ac90e46fdb5649ab6369d1ab6104bfe5854ab19b645bf5cda0127a13034ae", size = 17899600 }, + { url = "https://files.pythonhosted.org/packages/ec/87/36801f4dc2623d76a0a3835975524a84bd2b18fe0f8835d45c8eae2f9ff2/numpy-2.2.5-cp312-cp312-win32.whl", hash = "sha256:0cd48122a6b7eab8f06404805b1bd5856200e3ed6f8a1b9a194f9d9054631beb", size = 6312626 }, + { url = "https://files.pythonhosted.org/packages/8b/09/4ffb4d6cfe7ca6707336187951992bd8a8b9142cf345d87ab858d2d7636a/numpy-2.2.5-cp312-cp312-win_amd64.whl", hash = "sha256:ced69262a8278547e63409b2653b372bf4baff0870c57efa76c5703fd6543282", size = 12645715 }, + { url = 
"https://files.pythonhosted.org/packages/e2/a0/0aa7f0f4509a2e07bd7a509042967c2fab635690d4f48c6c7b3afd4f448c/numpy-2.2.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:059b51b658f4414fff78c6d7b1b4e18283ab5fa56d270ff212d5ba0c561846f4", size = 20935102 }, + { url = "https://files.pythonhosted.org/packages/7e/e4/a6a9f4537542912ec513185396fce52cdd45bdcf3e9d921ab02a93ca5aa9/numpy-2.2.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:47f9ed103af0bc63182609044b0490747e03bd20a67e391192dde119bf43d52f", size = 14191709 }, + { url = "https://files.pythonhosted.org/packages/be/65/72f3186b6050bbfe9c43cb81f9df59ae63603491d36179cf7a7c8d216758/numpy-2.2.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:261a1ef047751bb02f29dfe337230b5882b54521ca121fc7f62668133cb119c9", size = 5149173 }, + { url = "https://files.pythonhosted.org/packages/e5/e9/83e7a9432378dde5802651307ae5e9ea07bb72b416728202218cd4da2801/numpy-2.2.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4520caa3807c1ceb005d125a75e715567806fed67e315cea619d5ec6e75a4191", size = 6684502 }, + { url = "https://files.pythonhosted.org/packages/ea/27/b80da6c762394c8ee516b74c1f686fcd16c8f23b14de57ba0cad7349d1d2/numpy-2.2.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d14b17b9be5f9c9301f43d2e2a4886a33b53f4e6fdf9ca2f4cc60aeeee76372", size = 14084417 }, + { url = "https://files.pythonhosted.org/packages/aa/fc/ebfd32c3e124e6a1043e19c0ab0769818aa69050ce5589b63d05ff185526/numpy-2.2.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba321813a00e508d5421104464510cc962a6f791aa2fca1c97b1e65027da80d", size = 16133807 }, + { url = "https://files.pythonhosted.org/packages/bf/9b/4cc171a0acbe4666f7775cfd21d4eb6bb1d36d3a0431f48a73e9212d2278/numpy-2.2.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4cbdef3ddf777423060c6f81b5694bad2dc9675f110c4b2a60dc0181543fac7", size = 15575611 }, + { url = 
"https://files.pythonhosted.org/packages/a3/45/40f4135341850df48f8edcf949cf47b523c404b712774f8855a64c96ef29/numpy-2.2.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:54088a5a147ab71a8e7fdfd8c3601972751ded0739c6b696ad9cb0343e21ab73", size = 17895747 }, + { url = "https://files.pythonhosted.org/packages/f8/4c/b32a17a46f0ffbde8cc82df6d3daeaf4f552e346df143e1b188a701a8f09/numpy-2.2.5-cp313-cp313-win32.whl", hash = "sha256:c8b82a55ef86a2d8e81b63da85e55f5537d2157165be1cb2ce7cfa57b6aef38b", size = 6309594 }, + { url = "https://files.pythonhosted.org/packages/13/ae/72e6276feb9ef06787365b05915bfdb057d01fceb4a43cb80978e518d79b/numpy-2.2.5-cp313-cp313-win_amd64.whl", hash = "sha256:d8882a829fd779f0f43998e931c466802a77ca1ee0fe25a3abe50278616b1471", size = 12638356 }, + { url = "https://files.pythonhosted.org/packages/79/56/be8b85a9f2adb688e7ded6324e20149a03541d2b3297c3ffc1a73f46dedb/numpy-2.2.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:e8b025c351b9f0e8b5436cf28a07fa4ac0204d67b38f01433ac7f9b870fa38c6", size = 20963778 }, + { url = "https://files.pythonhosted.org/packages/ff/77/19c5e62d55bff507a18c3cdff82e94fe174957bad25860a991cac719d3ab/numpy-2.2.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8dfa94b6a4374e7851bbb6f35e6ded2120b752b063e6acdd3157e4d2bb922eba", size = 14207279 }, + { url = "https://files.pythonhosted.org/packages/75/22/aa11f22dc11ff4ffe4e849d9b63bbe8d4ac6d5fae85ddaa67dfe43be3e76/numpy-2.2.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:97c8425d4e26437e65e1d189d22dff4a079b747ff9c2788057bfb8114ce1e133", size = 5199247 }, + { url = "https://files.pythonhosted.org/packages/4f/6c/12d5e760fc62c08eded0394f62039f5a9857f758312bf01632a81d841459/numpy-2.2.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:352d330048c055ea6db701130abc48a21bec690a8d38f8284e00fab256dc1376", size = 6711087 }, + { url = 
"https://files.pythonhosted.org/packages/ef/94/ece8280cf4218b2bee5cec9567629e61e51b4be501e5c6840ceb593db945/numpy-2.2.5-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b4c0773b6ada798f51f0f8e30c054d32304ccc6e9c5d93d46cb26f3d385ab19", size = 14059964 }, + { url = "https://files.pythonhosted.org/packages/39/41/c5377dac0514aaeec69115830a39d905b1882819c8e65d97fc60e177e19e/numpy-2.2.5-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55f09e00d4dccd76b179c0f18a44f041e5332fd0e022886ba1c0bbf3ea4a18d0", size = 16121214 }, + { url = "https://files.pythonhosted.org/packages/db/54/3b9f89a943257bc8e187145c6bc0eb8e3d615655f7b14e9b490b053e8149/numpy-2.2.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:02f226baeefa68f7d579e213d0f3493496397d8f1cff5e2b222af274c86a552a", size = 15575788 }, + { url = "https://files.pythonhosted.org/packages/b1/c4/2e407e85df35b29f79945751b8f8e671057a13a376497d7fb2151ba0d290/numpy-2.2.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c26843fd58f65da9491165072da2cccc372530681de481ef670dcc8e27cfb066", size = 17893672 }, + { url = "https://files.pythonhosted.org/packages/29/7e/d0b44e129d038dba453f00d0e29ebd6eaf2f06055d72b95b9947998aca14/numpy-2.2.5-cp313-cp313t-win32.whl", hash = "sha256:1a161c2c79ab30fe4501d5a2bbfe8b162490757cf90b7f05be8b80bc02f7bb8e", size = 6377102 }, + { url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096 }, ] [[package]] @@ -2813,15 +2929,15 @@ flashrank = [ [[package]] name = "rich" -version = "13.9.4" +version = "14.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "markdown-it-py" }, { name = "pygments" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/ab/3a/0316b28d0761c6734d6bc14e770d85506c986c85ffb239e688eeaab2c2bc/rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098", size = 223149 } +sdist = { url = "https://files.pythonhosted.org/packages/a1/53/830aa4c3066a8ab0ae9a9955976fb770fe9c6102117c8ec4ab3ea62d89e8/rich-14.0.0.tar.gz", hash = "sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725", size = 224078 } wheels = [ - { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424 }, + { url = "https://files.pythonhosted.org/packages/0d/9b/63f4c7ebc259242c89b3acafdb37b41d1185c07ff0011164674e9076b491/rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0", size = 243229 }, ] [[package]] @@ -3112,7 +3228,7 @@ dependencies = [ requires-dist = [ { name = "alembic", specifier = ">=1.13.0" }, { name = "asyncpg", specifier = ">=0.30.0" }, - { name = "chonkie", extras = ["all"], specifier = ">=0.4.1" }, + { name = "chonkie", extras = ["all"], specifier = ">=1.0.6" }, { name = "fastapi", specifier = ">=0.115.8" }, { name = "fastapi-users", extras = ["oauth", "sqlalchemy"], specifier = ">=14.0.1" }, { name = "firecrawl-py", specifier = ">=1.12.0" }, @@ -3339,6 +3455,91 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/1a/efeecb8d83705f2f4beac98d46f2148c95ecd7babfb31b5c0f1e7017e83d/transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36", size = 9669412 }, ] +[[package]] +name = "tree-sitter" +version = "0.24.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/a7/a2/698b9d31d08ad5558f8bfbfe3a0781bd4b1f284e89bde3ad18e05101a892/tree-sitter-0.24.0.tar.gz", hash = "sha256:abd95af65ca2f4f7eca356343391ed669e764f37748b5352946f00f7fc78e734", size = 168304 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/57/3a590f287b5aa60c07d5545953912be3d252481bf5e178f750db75572bff/tree_sitter-0.24.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:14beeff5f11e223c37be7d5d119819880601a80d0399abe8c738ae2288804afc", size = 140788 }, + { url = "https://files.pythonhosted.org/packages/61/0b/fc289e0cba7dbe77c6655a4dd949cd23c663fd62a8b4d8f02f97e28d7fe5/tree_sitter-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26a5b130f70d5925d67b47db314da209063664585a2fd36fa69e0717738efaf4", size = 133945 }, + { url = "https://files.pythonhosted.org/packages/86/d7/80767238308a137e0b5b5c947aa243e3c1e3e430e6d0d5ae94b9a9ffd1a2/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fc5c3c26d83c9d0ecb4fc4304fba35f034b7761d35286b936c1db1217558b4e", size = 564819 }, + { url = "https://files.pythonhosted.org/packages/bf/b3/6c5574f4b937b836601f5fb556b24804b0a6341f2eb42f40c0e6464339f4/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:772e1bd8c0931c866b848d0369b32218ac97c24b04790ec4b0e409901945dd8e", size = 579303 }, + { url = "https://files.pythonhosted.org/packages/0a/f4/bd0ddf9abe242ea67cca18a64810f8af230fc1ea74b28bb702e838ccd874/tree_sitter-0.24.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:24a8dd03b0d6b8812425f3b84d2f4763322684e38baf74e5bb766128b5633dc7", size = 581054 }, + { url = "https://files.pythonhosted.org/packages/8c/1c/ff23fa4931b6ef1bbeac461b904ca7e49eaec7e7e5398584e3eef836ec96/tree_sitter-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:f9e8b1605ab60ed43803100f067eed71b0b0e6c1fb9860a262727dbfbbb74751", size = 120221 }, + { url = 
"https://files.pythonhosted.org/packages/b2/2a/9979c626f303177b7612a802237d0533155bf1e425ff6f73cc40f25453e2/tree_sitter-0.24.0-cp312-cp312-win_arm64.whl", hash = "sha256:f733a83d8355fc95561582b66bbea92ffd365c5d7a665bc9ebd25e049c2b2abb", size = 108234 }, + { url = "https://files.pythonhosted.org/packages/61/cd/2348339c85803330ce38cee1c6cbbfa78a656b34ff58606ebaf5c9e83bd0/tree_sitter-0.24.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d4a6416ed421c4210f0ca405a4834d5ccfbb8ad6692d4d74f7773ef68f92071", size = 140781 }, + { url = "https://files.pythonhosted.org/packages/8b/a3/1ea9d8b64e8dcfcc0051028a9c84a630301290995cd6e947bf88267ef7b1/tree_sitter-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e0992d483677e71d5c5d37f30dfb2e3afec2f932a9c53eec4fca13869b788c6c", size = 133928 }, + { url = "https://files.pythonhosted.org/packages/fe/ae/55c1055609c9428a4aedf4b164400ab9adb0b1bf1538b51f4b3748a6c983/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57277a12fbcefb1c8b206186068d456c600dbfbc3fd6c76968ee22614c5cd5ad", size = 564497 }, + { url = "https://files.pythonhosted.org/packages/ce/d0/f2ffcd04882c5aa28d205a787353130cbf84b2b8a977fd211bdc3b399ae3/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d25fa22766d63f73716c6fec1a31ee5cf904aa429484256bd5fdf5259051ed74", size = 578917 }, + { url = "https://files.pythonhosted.org/packages/af/82/aebe78ea23a2b3a79324993d4915f3093ad1af43d7c2208ee90be9273273/tree_sitter-0.24.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7d5d9537507e1c8c5fa9935b34f320bfec4114d675e028f3ad94f11cf9db37b9", size = 581148 }, + { url = "https://files.pythonhosted.org/packages/a1/b4/6b0291a590c2b0417cfdb64ccb8ea242f270a46ed429c641fbc2bfab77e0/tree_sitter-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:f58bb4956917715ec4d5a28681829a8dad5c342cafd4aea269f9132a83ca9b34", size = 120207 }, + { url = 
"https://files.pythonhosted.org/packages/a8/18/542fd844b75272630229c9939b03f7db232c71a9d82aadc59c596319ea6a/tree_sitter-0.24.0-cp313-cp313-win_arm64.whl", hash = "sha256:23641bd25dcd4bb0b6fa91b8fb3f46cc9f1c9f475efe4d536d3f1f688d1b84c8", size = 108232 }, +] + +[[package]] +name = "tree-sitter-c-sharp" +version = "0.23.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/22/85/a61c782afbb706a47d990eaee6977e7c2bd013771c5bf5c81c617684f286/tree_sitter_c_sharp-0.23.1.tar.gz", hash = "sha256:322e2cfd3a547a840375276b2aea3335fa6458aeac082f6c60fec3f745c967eb", size = 1317728 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/04/f6c2df4c53a588ccd88d50851155945cff8cd887bd70c175e00aaade7edf/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2b612a6e5bd17bb7fa2aab4bb6fc1fba45c94f09cb034ab332e45603b86e32fd", size = 372235 }, + { url = "https://files.pythonhosted.org/packages/99/10/1aa9486f1e28fc22810fa92cbdc54e1051e7f5536a5e5b5e9695f609b31e/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a8b98f62bc53efcd4d971151950c9b9cd5cbe3bacdb0cd69fdccac63350d83e", size = 419046 }, + { url = "https://files.pythonhosted.org/packages/0f/21/13df29f8fcb9ba9f209b7b413a4764b673dfd58989a0dd67e9c7e19e9c2e/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:986e93d845a438ec3c4416401aa98e6a6f6631d644bbbc2e43fcb915c51d255d", size = 415999 }, + { url = "https://files.pythonhosted.org/packages/ca/72/fc6846795bcdae2f8aa94cc8b1d1af33d634e08be63e294ff0d6794b1efc/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8024e466b2f5611c6dc90321f232d8584893c7fb88b75e4a831992f877616d2", size = 402830 }, + { url = 
"https://files.pythonhosted.org/packages/fe/3a/b6028c5890ce6653807d5fa88c72232c027c6ceb480dbeb3b186d60e5971/tree_sitter_c_sharp-0.23.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7f9bf876866835492281d336b9e1f9626ab668737f74e914c31d285261507da7", size = 397880 }, + { url = "https://files.pythonhosted.org/packages/47/d2/4facaa34b40f8104d8751746d0e1cd2ddf0beb9f1404b736b97f372bd1f3/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_amd64.whl", hash = "sha256:ae9a9e859e8f44e2b07578d44f9a220d3fa25b688966708af6aa55d42abeebb3", size = 377562 }, + { url = "https://files.pythonhosted.org/packages/d8/88/3cf6bd9959d94d1fec1e6a9c530c5f08ff4115a474f62aedb5fedb0f7241/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_arm64.whl", hash = "sha256:c81548347a93347be4f48cb63ec7d60ef4b0efa91313330e69641e49aa5a08c5", size = 375157 }, +] + +[[package]] +name = "tree-sitter-embedded-template" +version = "0.23.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/28/d6/5a58ea2f0480f5ed188b733114a8c275532a2fd1568b3898793b13d28af5/tree_sitter_embedded_template-0.23.2.tar.gz", hash = "sha256:7b24dcf2e92497f54323e617564d36866230a8bfb719dbb7b45b461510dcddaa", size = 8471 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/c1/be0c48ed9609b720e74ade86f24ea086e353fe9c7405ee9630c3d52d09a2/tree_sitter_embedded_template-0.23.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:a505c2d2494464029d79db541cab52f6da5fb326bf3d355e69bf98b84eb89ae0", size = 9554 }, + { url = "https://files.pythonhosted.org/packages/6d/a5/7c12f5d302525ee36d1eafc28a68e4454da5bad208436d547326bee4ed76/tree_sitter_embedded_template-0.23.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:28028b93b42cc3753261ae7ce066675d407f59de512417524f9c3ab7792b1d37", size = 10051 }, + { url = 
"https://files.pythonhosted.org/packages/cd/87/95aaba8b64b849200bd7d4ae510cc394ecaef46a031499cbff301766970d/tree_sitter_embedded_template-0.23.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec399d59ce93ffb60759a2d96053eed529f3c3f6a27128f261710d0d0de60e10", size = 17532 }, + { url = "https://files.pythonhosted.org/packages/13/f8/8c837b898f00b35f9f3f76a4abc525e80866a69343083c9ff329e17ecb03/tree_sitter_embedded_template-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcfa01f62b88d50dbcb736cc23baec8ddbfe08daacfdc613eee8c04ab65efd09", size = 17394 }, + { url = "https://files.pythonhosted.org/packages/89/9b/893adf9e465d2d7f14870871bf2f3b30045e5ac417cb596f667a72eda493/tree_sitter_embedded_template-0.23.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6debd24791466f887109a433c31aa4a5deeba2b217817521c745a4e748a944ed", size = 16439 }, + { url = "https://files.pythonhosted.org/packages/40/96/e79934572723673db9f867000500c6eea61a37705e02c7aee9ee031bbb6f/tree_sitter_embedded_template-0.23.2-cp39-abi3-win_amd64.whl", hash = "sha256:158fecb38be5b15db0190ef7238e5248f24bf32ae3cab93bc1197e293a5641eb", size = 12572 }, + { url = "https://files.pythonhosted.org/packages/63/06/27f678b9874e4e2e39ddc6f5cce3374c8c60e6046ea8588a491ab6fc9fcb/tree_sitter_embedded_template-0.23.2-cp39-abi3-win_arm64.whl", hash = "sha256:9f1f3b79fe273f3d15a5b64c85fc6ebfb48decfbe8542accd05f5b7694860df0", size = 11232 }, +] + +[[package]] +name = "tree-sitter-language-pack" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tree-sitter" }, + { name = "tree-sitter-c-sharp" }, + { name = "tree-sitter-embedded-template" }, + { name = "tree-sitter-yaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/1e/2d63d93025fd5b527327c3fd348955cebaec02a3f1bcec88ab4d88ddfc39/tree_sitter_language_pack-0.7.2.tar.gz", hash = 
"sha256:46fc96cc3bddfee7091fdedec2ae7e34218679e58241e8319bf82026f6d02eae", size = 59264078 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/9d/2c6272bf4fd18a22d8c07d3c983940dbece4f0e9e21f5c78f15a2740f435/tree_sitter_language_pack-0.7.2-cp39-abi3-macosx_10_13_universal2.whl", hash = "sha256:4036603020bd32060d9931a64f8c3d8637de575f350f11534971012e51a27a95", size = 28132977 }, + { url = "https://files.pythonhosted.org/packages/2b/e2/0f2511019c27b870061f9ad719074095ef84cd7857a730765bfa066384be/tree_sitter_language_pack-0.7.2-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:801926dbc81eeca4ce97b846cc899dcf3fecfdc3b2514a68eeeb118f70ac686d", size = 17576769 }, + { url = "https://files.pythonhosted.org/packages/3a/88/7b38233def5c359503ad4d36533f96f9fe2943a8eeeced66b36312c49e1b/tree_sitter_language_pack-0.7.2-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:77be80335fb585f48eb268b0e07ca54f3da8f30c2eab7be749113f116c3ef316", size = 17433872 }, + { url = "https://files.pythonhosted.org/packages/f8/27/fc5dce240b68a1ed876bc80b2238fbaaa0f695dbaf88660728a0239a2b20/tree_sitter_language_pack-0.7.2-cp39-abi3-win_amd64.whl", hash = "sha256:d71c6b4c14b3370ca783319ede7a581a10e6dd1bdfe5d31d316d9216981a6406", size = 14316050 }, +] + +[[package]] +name = "tree-sitter-yaml" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/93/04/6de8be8112c50450cab753fcd6b74d8368c60f6099bf551cee0bec69563a/tree_sitter_yaml-0.7.0.tar.gz", hash = "sha256:9c8bb17d9755c3b0e757260917240c0d19883cd3b59a5d74f205baa8bf8435a4", size = 85085 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/1d/243dbdf59fae8a4109e19f0994e2627ddedb2e16b7cf99bd42be64367742/tree_sitter_yaml-0.7.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:e21553ac190ae05bf82796df8beb4d9158ba195b5846018cb36fbc3a35bd0679", size = 43335 }, + { url = 
"https://files.pythonhosted.org/packages/e2/63/e5d5868a1498e20fd07e7db62933766fd64950279862e3e7f150b88ec69d/tree_sitter_yaml-0.7.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c022054f1f9b54201082ea83073a6c24c42d0436ad8ee99ff2574cba8f928c28", size = 44574 }, + { url = "https://files.pythonhosted.org/packages/f5/ba/9cff9a3fddb1b6b38bc71ce1dfdb8892ab15a4042c104f4582e30318b412/tree_sitter_yaml-0.7.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cd1725142f19e41c51d27c99cfc60780f596e069eb181cfa6433d993a19aa3d", size = 93088 }, + { url = "https://files.pythonhosted.org/packages/19/09/39d29d9a22cee0b3c3e4f3fdbd23e4534b9c2a84b5f962f369eafcfbf88c/tree_sitter_yaml-0.7.0-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d1b268378254f75bb27396d83c96d886ccbfcda6bd8c2778e94e3e1d2459085", size = 91367 }, + { url = "https://files.pythonhosted.org/packages/b0/b7/285653b894b351436917b5fe5e738eecaeb2128b4e4bf72bfe0c6043f62e/tree_sitter_yaml-0.7.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:27c2e7f4f49ddf410003abbb82a7b00ec77ea263d8ef08dbce1a15d293eed2fd", size = 87405 }, + { url = "https://files.pythonhosted.org/packages/bb/73/0cdc82ea653c190475a4f63dd4a1f4efd5d1c7d09d2668b8d84008a4c4f8/tree_sitter_yaml-0.7.0-cp39-abi3-win_amd64.whl", hash = "sha256:98dce0d6bc376f842cfb1d3c32512eea95b37e61cd2c87074bb4b05c999917c8", size = 45360 }, + { url = "https://files.pythonhosted.org/packages/2e/32/af2d676b0176a958f22a75b04be836e09476a10844baab78c018a5030297/tree_sitter_yaml-0.7.0-cp39-abi3-win_arm64.whl", hash = "sha256:f0f8d8e05fa8e70f08d0f18a209d6026e171844f4ea7090e7c779b9c375b3a31", size = 43650 }, +] + [[package]] name = "triton" version = "3.2.0" @@ -3348,6 +3549,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/30/37a3384d1e2e9320331baca41e835e90a3767303642c7a80d4510152cbcf/triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0", size = 253154278 }, ] +[[package]] +name = "types-requests" +version = "2.32.0.20250328" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/7d/eb174f74e3f5634eaacb38031bbe467dfe2e545bc255e5c90096ec46bc46/types_requests-2.32.0.20250328.tar.gz", hash = "sha256:c9e67228ea103bd811c96984fac36ed2ae8da87a36a633964a21f199d60baf32", size = 22995 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cc/15/3700282a9d4ea3b37044264d3e4d1b1f0095a4ebf860a99914fd544e3be3/types_requests-2.32.0.20250328-py3-none-any.whl", hash = "sha256:72ff80f84b15eb3aa7a8e2625fffb6a93f2ad5a0c20215fc1dcfa61117bcb2a2", size = 20663 }, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -3381,7 +3594,7 @@ wheels = [ [[package]] name = "unstructured" -version = "0.16.25" +version = "0.17.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "backoff" }, @@ -3406,9 +3619,9 @@ dependencies = [ { name = "unstructured-client" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/64/31/98c4c78e305d1294888adf87fd5ee30577a4c393951341ca32b43f167f1e/unstructured-0.16.25.tar.gz", hash = "sha256:73b9b0f51dbb687af572ecdb849a6811710b9cac797ddeab8ee80fa07d8aa5e6", size = 1683097 } +sdist = { url = "https://files.pythonhosted.org/packages/b4/49/b95ff4b609d7328cd0394ac9d8ad69839e11a1f879462496afcf4887154a/unstructured-0.17.2.tar.gz", hash = "sha256:af18c3caef0a6c562cf77e34ee8b6ff522b605031d2336ffe565df66f126aa46", size = 1684745 } wheels = [ - { url = "https://files.pythonhosted.org/packages/12/4f/ad08585b5c8a33c82ea119494c4d3023f4796958c56e668b15cc282ec0a0/unstructured-0.16.25-py3-none-any.whl", hash = "sha256:14719ccef2830216cf1c5bf654f75e2bf07b17ca5dcee9da5ac74618130fd337", size = 1769286 }, + { url = 
"https://files.pythonhosted.org/packages/cb/88/061a9dedd4e8cc0c31097c3275a9ef1fd7307e26afac5cd582487386e1b8/unstructured-0.17.2-py3-none-any.whl", hash = "sha256:527dd26a4b273aebef2f9119c9d4f0d0ce17640038d92296d23abe89be123840", size = 1771563 }, ] [package.optional-dependencies] @@ -3418,6 +3631,7 @@ all-docs = [ { name = "markdown" }, { name = "networkx" }, { name = "onnx" }, + { name = "onnxruntime" }, { name = "openpyxl" }, { name = "pandas" }, { name = "pdf2image" }, From 42bde287819b1db96c8a240fb731404b8ff8687c Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 30 Apr 2025 00:10:50 -0700 Subject: [PATCH 07/70] fix: Support for All Embeddings --- .../versions/5_remove_title_char_limit.py | 58 +++++++++++++++++++ surfsense_backend/app/config/__init__.py | 4 +- surfsense_backend/app/db.py | 6 +- 3 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 surfsense_backend/alembic/versions/5_remove_title_char_limit.py diff --git a/surfsense_backend/alembic/versions/5_remove_title_char_limit.py b/surfsense_backend/alembic/versions/5_remove_title_char_limit.py new file mode 100644 index 000000000..57ed10899 --- /dev/null +++ b/surfsense_backend/alembic/versions/5_remove_title_char_limit.py @@ -0,0 +1,58 @@ +"""Remove char limit on title columns + +Revision ID: 5 +Revises: 4 +Create Date: 2023-06-10 00:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '5' +down_revision: Union[str, None] = '4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Alter Chat table + op.alter_column('chats', 'title', + existing_type=sa.String(200), + type_=sa.String(), + existing_nullable=False) + + # Alter Document table + op.alter_column('documents', 'title', + existing_type=sa.String(200), + type_=sa.String(), + existing_nullable=False) + + # Alter Podcast table + op.alter_column('podcasts', 'title', + existing_type=sa.String(200), + type_=sa.String(), + existing_nullable=False) + + +def downgrade() -> None: + # Revert Chat table + op.alter_column('chats', 'title', + existing_type=sa.String(), + type_=sa.String(200), + existing_nullable=False) + + # Revert Document table + op.alter_column('documents', 'title', + existing_type=sa.String(), + type_=sa.String(200), + existing_nullable=False) + + # Revert Podcast table + op.alter_column('podcasts', 'title', + existing_type=sa.String(), + type_=sa.String(200), + existing_nullable=False) \ No newline at end of file diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 4adf2b7dc..91968aac0 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -38,10 +38,10 @@ class Config: EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL") embedding_model_instance = AutoEmbeddings.get_embeddings(EMBEDDING_MODEL) chunker_instance = RecursiveChunker( - chunk_size=embedding_model_instance.max_seq_length, + chunk_size=getattr(embedding_model_instance, 'max_seq_length', 512) ) code_chunker_instance = CodeChunker( - chunk_size=embedding_model_instance.max_seq_length + chunk_size=getattr(embedding_model_instance, 'max_seq_length', 512) ) # Reranker's Configuration | Pinecode, Cohere etc. 
Read more at https://github.com/AnswerDotAI/rerankers?tab=readme-ov-file#usage diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 320f059dd..b4ee3e790 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -76,7 +76,7 @@ class Chat(BaseModel, TimestampMixin): __tablename__ = "chats" type = Column(SQLAlchemyEnum(ChatType), nullable=False) - title = Column(String(200), nullable=False, index=True) + title = Column(String, nullable=False, index=True) initial_connectors = Column(ARRAY(String), nullable=True) messages = Column(JSON, nullable=False) @@ -86,7 +86,7 @@ class Chat(BaseModel, TimestampMixin): class Document(BaseModel, TimestampMixin): __tablename__ = "documents" - title = Column(String(200), nullable=False, index=True) + title = Column(String, nullable=False, index=True) document_type = Column(SQLAlchemyEnum(DocumentType), nullable=False) document_metadata = Column(JSON, nullable=True) @@ -109,7 +109,7 @@ class Chunk(BaseModel, TimestampMixin): class Podcast(BaseModel, TimestampMixin): __tablename__ = "podcasts" - title = Column(String(200), nullable=False, index=True) + title = Column(String, nullable=False, index=True) is_generated = Column(Boolean, nullable=False, default=False) podcast_content = Column(Text, nullable=False, default="") file_location = Column(String(500), nullable=False, default="") From d899678b72d24e97bed702e2c0e625d1db02ef71 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 30 Apr 2025 23:52:29 -0700 Subject: [PATCH 08/70] chore: updated docs --- surfsense_backend/app/config/__init__.py | 4 ++-- surfsense_web/content/docs/docker-installation.mdx | 2 +- surfsense_web/content/docs/manual-installation.mdx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 91968aac0..8c457e17b 100644 --- a/surfsense_backend/app/config/__init__.py +++ 
b/surfsense_backend/app/config/__init__.py @@ -18,7 +18,7 @@ class Config: # Database DATABASE_URL = os.getenv("DATABASE_URL") - # Google OAuth + # AUTH: Google OAuth GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID") GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") NEXT_FRONTEND_URL = os.getenv("NEXT_FRONTEND_URL") @@ -27,7 +27,7 @@ class Config: LONG_CONTEXT_LLM = os.getenv("LONG_CONTEXT_LLM") long_context_llm_instance = ChatLiteLLM(model=LONG_CONTEXT_LLM) - # GPT Researcher + # FAST & STRATEGIC LLM's FAST_LLM = os.getenv("FAST_LLM") STRATEGIC_LLM = os.getenv("STRATEGIC_LLM") fast_llm_instance = ChatLiteLLM(model=FAST_LLM) diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx index 2a373d048..236366546 100644 --- a/surfsense_web/content/docs/docker-installation.mdx +++ b/surfsense_web/content/docs/docker-installation.mdx @@ -65,7 +65,7 @@ Before you begin, ensure you have: | GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID obtained from Google Cloud Console | | GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret obtained from Google Cloud Console | | NEXT_FRONTEND_URL | URL where your frontend application is hosted (e.g., `http://localhost:3000`) | - | EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | + | EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | | RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx index 477f5ef17..3813b1b88 100644 --- a/surfsense_web/content/docs/manual-installation.mdx +++ 
b/surfsense_web/content/docs/manual-installation.mdx @@ -53,7 +53,7 @@ Edit the `.env` file and set the following variables: | GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID | | GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret | | NEXT_FRONTEND_URL | Frontend application URL (e.g., `http://localhost:3000`) | -| EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | +| EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | | RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | FAST_LLM | LiteLLM routed faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | From dc97072145f3840d5b4e1d7d5721556f829dde06 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 30 Apr 2025 23:52:58 -0700 Subject: [PATCH 09/70] chore: updated docs --- surfsense_backend/app/config/__init__.py | 4 ++-- surfsense_web/content/docs/docker-installation.mdx | 2 +- surfsense_web/content/docs/manual-installation.mdx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 91968aac0..8c457e17b 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -18,7 +18,7 @@ class Config: # Database DATABASE_URL = os.getenv("DATABASE_URL") - # Google OAuth + # AUTH: Google OAuth GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID") GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") NEXT_FRONTEND_URL = os.getenv("NEXT_FRONTEND_URL") @@ -27,7 +27,7 @@ class Config: LONG_CONTEXT_LLM = os.getenv("LONG_CONTEXT_LLM") long_context_llm_instance = ChatLiteLLM(model=LONG_CONTEXT_LLM) - # GPT Researcher + # FAST & STRATEGIC LLM's FAST_LLM = os.getenv("FAST_LLM") STRATEGIC_LLM = 
os.getenv("STRATEGIC_LLM") fast_llm_instance = ChatLiteLLM(model=FAST_LLM) diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx index 2a373d048..236366546 100644 --- a/surfsense_web/content/docs/docker-installation.mdx +++ b/surfsense_web/content/docs/docker-installation.mdx @@ -65,7 +65,7 @@ Before you begin, ensure you have: | GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID obtained from Google Cloud Console | | GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret obtained from Google Cloud Console | | NEXT_FRONTEND_URL | URL where your frontend application is hosted (e.g., `http://localhost:3000`) | - | EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | + | EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | | RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx index 477f5ef17..3813b1b88 100644 --- a/surfsense_web/content/docs/manual-installation.mdx +++ b/surfsense_web/content/docs/manual-installation.mdx @@ -53,7 +53,7 @@ Edit the `.env` file and set the following variables: | GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID | | GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret | | NEXT_FRONTEND_URL | Frontend application URL (e.g., `http://localhost:3000`) | -| EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | +| EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | | RERANKERS_MODEL_NAME | 
Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | FAST_LLM | LiteLLM routed faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | From 906344d6f3724bcf65aeaf4056a7bd5133f3fac0 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sat, 3 May 2025 01:08:19 -0700 Subject: [PATCH 10/70] chore: qol patches --- README.md | 17 +++++++-- surfsense_web/app/login/GoogleLoginButton.tsx | 36 +++++++++++++++++++ .../components/ModernHeroWithGradients.tsx | 11 ++++++ surfsense_web/components/Navbar.tsx | 20 ++--------- 4 files changed, 64 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index ad8633c47..d2af00f33 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,16 @@ - - ![new_header](https://github.com/user-attachments/assets/e236b764-0ddc-42ff-a1f1-8fbb3d2e0e65) + # SurfSense While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily, LinkUp), Slack, Linear, Notion, YouTube, GitHub and more to come. +
+MODSetter%2FSurfSense | Trendshift +
+ # Video @@ -203,3 +206,13 @@ Before installation, make sure to complete the [prerequisite setup steps](https: Contributions are very welcome! A contribution can be as small as a ⭐ or even finding and creating issues. Fine-tuning the Backend is always desired. +## Star History + + + + + + Star History Chart + + + diff --git a/surfsense_web/app/login/GoogleLoginButton.tsx b/surfsense_web/app/login/GoogleLoginButton.tsx index a4ed4a3a0..11caafbf3 100644 --- a/surfsense_web/app/login/GoogleLoginButton.tsx +++ b/surfsense_web/app/login/GoogleLoginButton.tsx @@ -34,6 +34,42 @@ export function GoogleLoginButton() { Welcome Back + + + + + + + +
+

+ SurfSense Cloud is currently in development. Check Docs for more information on Self-Hosted version. +

+
+
+
+
+
+ + MODSetter%2FSurfSense | Trendshift + +
{ const [hoveredIndex, setHoveredIndex] = useState(null); const handleGoogleLogin = () => { - // Redirect to Google OAuth authorization URL - fetch(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/auth/google/authorize`) - .then((response) => { - if (!response.ok) { - throw new Error('Failed to get authorization URL'); - } - return response.json(); - }) - .then((data) => { - if (data.authorization_url) { - window.location.href = data.authorization_url; - } else { - console.error('No authorization URL received'); - } - }) - .catch((error) => { - console.error('Error during Google login:', error); - }); + // Redirect to the login page + window.location.href = '/login'; }; return ( From 10d56acaa886ffbe776c8c99bb0430f15e94da32 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Mon, 5 May 2025 01:39:31 -0700 Subject: [PATCH 11/70] feat: Stable & Hella Fast Podcast Agent with auto FFMPEG handling. --- .gitignore | 3 +- surfsense_backend/.env.example | 3 + .../app/agents/podcaster/__init__.py | 8 + .../app/agents/podcaster/configuration.py | 28 ++ .../app/agents/podcaster/graph.py | 23 + .../app/agents/podcaster/nodes.py | 197 ++++++++ .../app/agents/podcaster/prompts.py | 111 ++++ .../app/agents/podcaster/state.py | 38 ++ .../app/agents/podcaster/test_podcaster.py | 474 ++++++++++++++++++ surfsense_backend/app/config/__init__.py | 24 + surfsense_backend/pyproject.toml | 2 + surfsense_backend/uv.lock | 222 ++++++++ 12 files changed, 1132 insertions(+), 1 deletion(-) create mode 100644 surfsense_backend/app/agents/podcaster/__init__.py create mode 100644 surfsense_backend/app/agents/podcaster/configuration.py create mode 100644 surfsense_backend/app/agents/podcaster/graph.py create mode 100644 surfsense_backend/app/agents/podcaster/nodes.py create mode 100644 surfsense_backend/app/agents/podcaster/prompts.py create mode 100644 surfsense_backend/app/agents/podcaster/state.py create mode 100644 surfsense_backend/app/agents/podcaster/test_podcaster.py diff --git 
a/.gitignore b/.gitignore index ac1266863..b67a7dd64 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.flashrank_cache* \ No newline at end of file +.flashrank_cache* +podcasts/* diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 6dfcc9967..8e834bf1d 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -15,6 +15,9 @@ FAST_LLM="openai/gpt-4o-mini" STRATEGIC_LLM="openai/gpt-4o" LONG_CONTEXT_LLM="gemini/gemini-2.0-flash" +#LiteLLM TTS Provider: https://docs.litellm.ai/docs/text_to_speech#supported-providers +TTS_SERVICE="openai/tts-1" + # Chosen LiteLLM Providers Keys OPENAI_API_KEY="sk-proj-iA" GEMINI_API_KEY="AIzaSyB6-1641124124124124124124124124124" diff --git a/surfsense_backend/app/agents/podcaster/__init__.py b/surfsense_backend/app/agents/podcaster/__init__.py new file mode 100644 index 000000000..8459b2977 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/__init__.py @@ -0,0 +1,8 @@ +"""New LangGraph Agent. + +This module defines a custom graph. +""" + +from .graph import graph + +__all__ = ["graph"] diff --git a/surfsense_backend/app/agents/podcaster/configuration.py b/surfsense_backend/app/agents/podcaster/configuration.py new file mode 100644 index 000000000..6bbb4ce03 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/configuration.py @@ -0,0 +1,28 @@ +"""Define the configurable parameters for the agent.""" + +from __future__ import annotations + +from dataclasses import dataclass, fields +from typing import Optional + +from langchain_core.runnables import RunnableConfig + + +@dataclass(kw_only=True) +class Configuration: + """The configuration for the agent.""" + + # Changeme: Add configurable values here! 
+ # these values can be pre-set when you + # create assistants (https://langchain-ai.github.io/langgraph/cloud/how-tos/configuration_cloud/) + # and when you invoke the graph + podcast_title: str + + @classmethod + def from_runnable_config( + cls, config: Optional[RunnableConfig] = None + ) -> Configuration: + """Create a Configuration instance from a RunnableConfig object.""" + configurable = (config.get("configurable") or {}) if config else {} + _fields = {f.name for f in fields(cls) if f.init} + return cls(**{k: v for k, v in configurable.items() if k in _fields}) diff --git a/surfsense_backend/app/agents/podcaster/graph.py b/surfsense_backend/app/agents/podcaster/graph.py new file mode 100644 index 000000000..f4604a7c8 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/graph.py @@ -0,0 +1,23 @@ +from langgraph.graph import StateGraph + +from .configuration import Configuration +from .state import State + + +from .nodes import create_merged_podcast_audio, create_podcast_transcript + +# Define a new graph +workflow = StateGraph(State, config_schema=Configuration) + +# Add the node to the graph +workflow.add_node("create_podcast_transcript", create_podcast_transcript) +workflow.add_node("create_merged_podcast_audio", create_merged_podcast_audio) + +# Set the entrypoint as `call_model` +workflow.add_edge("__start__", "create_podcast_transcript") +workflow.add_edge("create_podcast_transcript", "create_merged_podcast_audio") +workflow.add_edge("create_merged_podcast_audio", "__end__") + +# Compile the workflow into an executable graph +graph = workflow.compile() +graph.name = "Surfsense Podcaster" # This defines the custom name in LangSmith diff --git a/surfsense_backend/app/agents/podcaster/nodes.py b/surfsense_backend/app/agents/podcaster/nodes.py new file mode 100644 index 000000000..810307ec2 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/nodes.py @@ -0,0 +1,197 @@ +from typing import Any, Dict +import json +import os +import uuid +from 
pathlib import Path +import asyncio + +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.runnables import RunnableConfig +from litellm import aspeech +from ffmpeg.asyncio import FFmpeg + +from .configuration import Configuration +from .state import PodcastTranscriptEntry, State, PodcastTranscripts +from .prompts import get_podcast_generation_prompt +from app.config import config as app_config + + +async def create_podcast_transcript(state: State, config: RunnableConfig) -> Dict[str, Any]: + """Each node does work.""" + + # Initialize LLM + llm = app_config.long_context_llm_instance + + # Get the prompt + prompt = get_podcast_generation_prompt() + + # Create the messages + messages = [ + SystemMessage(content=prompt), + HumanMessage(content=state.source_content) + ] + + # Generate the podcast transcript + llm_response = await llm.ainvoke(messages) + + # First try the direct approach + try: + podcast_transcript = PodcastTranscripts.model_validate(json.loads(llm_response.content)) + except (json.JSONDecodeError, ValueError) as e: + print(f"Direct JSON parsing failed, trying fallback approach: {str(e)}") + + # Fallback: Parse the JSON response manually + try: + # Extract JSON content from the response + content = llm_response.content + + # Find the JSON in the content (handle case where LLM might add additional text) + json_start = content.find('{') + json_end = content.rfind('}') + 1 + if json_start >= 0 and json_end > json_start: + json_str = content[json_start:json_end] + + # Parse the JSON string + parsed_data = json.loads(json_str) + + # Convert to Pydantic model + podcast_transcript = PodcastTranscripts.model_validate(parsed_data) + + print(f"Successfully parsed podcast transcript using fallback approach") + else: + # If JSON structure not found, raise a clear error + error_message = f"Could not find valid JSON in LLM response. 
Raw response: {content}" + print(error_message) + raise ValueError(error_message) + + except (json.JSONDecodeError, ValueError) as e2: + # Log the error and re-raise it + error_message = f"Error parsing LLM response (fallback also failed): {str(e2)}" + print(f"Error parsing LLM response: {str(e2)}") + print(f"Raw response: {llm_response.content}") + raise + + return { + "podcast_transcript": podcast_transcript.podcast_transcripts + } + + +async def create_merged_podcast_audio(state: State, config: RunnableConfig) -> Dict[str, Any]: + """Generate audio for each transcript and merge them into a single podcast file.""" + + configuration = Configuration.from_runnable_config(config) + + starting_transcript = PodcastTranscriptEntry( + speaker_id=1, + dialog=f"Welcome to {configuration.podcast_title} Podcast." + ) + + transcript = state.podcast_transcript + + # Merge the starting transcript with the podcast transcript + # Check if transcript is a PodcastTranscripts object or already a list + if hasattr(transcript, 'podcast_transcripts'): + transcript_entries = transcript.podcast_transcripts + else: + transcript_entries = transcript + + merged_transcript = [starting_transcript] + transcript_entries + + # Create a temporary directory for audio files + temp_dir = Path("temp_audio") + temp_dir.mkdir(exist_ok=True) + + # Generate a unique session ID for this podcast + session_id = str(uuid.uuid4()) + output_path = f"podcasts/{session_id}_podcast.mp3" + os.makedirs("podcasts", exist_ok=True) + + # Map of speaker_id to voice + voice_mapping = { + 0: "alloy", # Default/intro voice + 1: "echo", # First speaker + # 2: "fable", # Second speaker + # 3: "onyx", # Third speaker + # 4: "nova", # Fourth speaker + # 5: "shimmer" # Fifth speaker + } + + # Generate audio for each transcript segment + audio_files = [] + + async def generate_speech_for_segment(segment, index): + # Handle both dictionary and PodcastTranscriptEntry objects + if hasattr(segment, 'speaker_id'): + speaker_id = 
segment.speaker_id + dialog = segment.dialog + else: + speaker_id = segment.get("speaker_id", 0) + dialog = segment.get("dialog", "") + + # Select voice based on speaker_id + voice = voice_mapping.get(speaker_id, "alloy") + + # Generate a unique filename for this segment + filename = f"{temp_dir}/{session_id}_{index}.mp3" + + try: + # Generate speech using litellm + response = await aspeech( + model=app_config.TTS_SERVICE, + voice=voice, + input=dialog, + max_retries=2, + timeout=600, + ) + + # Save the audio to a file - use proper streaming method + with open(filename, 'wb') as f: + f.write(response.content) + + return filename + except Exception as e: + print(f"Error generating speech for segment {index}: {str(e)}") + raise + + # Generate all audio files concurrently + tasks = [generate_speech_for_segment(segment, i) for i, segment in enumerate(merged_transcript)] + audio_files = await asyncio.gather(*tasks) + + # Merge audio files using ffmpeg + try: + # Create FFmpeg instance with the first input + ffmpeg = FFmpeg().option("y") + + # Add each audio file as input + for audio_file in audio_files: + ffmpeg = ffmpeg.input(audio_file) + + # Configure the concatenation and output + filter_complex = [] + for i in range(len(audio_files)): + filter_complex.append(f"[{i}:0]") + + filter_complex_str = "".join(filter_complex) + f"concat=n={len(audio_files)}:v=0:a=1[outa]" + ffmpeg = ffmpeg.option("filter_complex", filter_complex_str) + ffmpeg = ffmpeg.output(output_path, map="[outa]") + + # Execute FFmpeg + await ffmpeg.execute() + + print(f"Successfully created podcast audio: {output_path}") + + except Exception as e: + print(f"Error merging audio files: {str(e)}") + raise + finally: + # Clean up temporary files + for audio_file in audio_files: + try: + os.remove(audio_file) + except: + pass + + return { + "podcast_transcript": merged_transcript, + "final_podcast_file_path": output_path + } diff --git a/surfsense_backend/app/agents/podcaster/prompts.py 
b/surfsense_backend/app/agents/podcaster/prompts.py new file mode 100644 index 000000000..2b4bdcfec --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/prompts.py @@ -0,0 +1,111 @@ +import datetime + + +def get_podcast_generation_prompt(): + return f""" +Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")} + +You are a master podcast scriptwriter, adept at transforming diverse input content into a lively, engaging, and natural-sounding conversation between two distinct podcast hosts. Your primary objective is to craft authentic, flowing dialogue that captures the spontaneity and chemistry of a real podcast discussion, completely avoiding any hint of robotic scripting or stiff formality. Think dynamic interplay, not just information delivery. + + +- '': A block of text containing the information to be discussed in the podcast. This could be research findings, an article summary, a detailed outline, user chat history related to the topic, or any other relevant raw information. The content might be unstructured but serves as the factual basis for the podcast dialogue. + + + +A JSON object containing the podcast transcript with alternating speakers: +{{ + "podcast_transcripts": [ + {{ + "speaker_id": 0, + "dialog": "Speaker 0 dialog here" + }}, + {{ + "speaker_id": 1, + "dialog": "Speaker 1 dialog here" + }}, + {{ + "speaker_id": 0, + "dialog": "Speaker 0 dialog here" + }}, + {{ + "speaker_id": 1, + "dialog": "Speaker 1 dialog here" + }} + ] +}} + + + +1. **Establish Distinct & Consistent Host Personas:** + * **Speaker 0 (Lead Host):** Drives the conversation forward, introduces segments, poses key questions derived from the source content, and often summarizes takeaways. Maintain a guiding, clear, and engaging tone. + * **Speaker 1 (Co-Host/Expert):** Offers deeper insights, provides alternative viewpoints or elaborations on the source content, asks clarifying or challenging questions, and shares relevant anecdotes or examples. 
Adopt a complementary tone (e.g., analytical, enthusiastic, reflective, slightly skeptical). + * **Consistency is Key:** Ensure each speaker maintains their distinct voice, vocabulary choice, sentence structure, and perspective throughout the entire script. Avoid having them sound interchangeable. Their interaction should feel like a genuine partnership. + +2. **Craft Natural & Dynamic Dialogue:** + * **Emulate Real Conversation:** Use contractions (e.g., "don't", "it's"), interjections ("Oh!", "Wow!", "Hmm"), discourse markers ("you know", "right?", "well"), and occasional natural pauses or filler words. Avoid overly formal language or complex sentence structures typical of written text. + * **Foster Interaction & Chemistry:** Write dialogue where speakers genuinely react *to each other*. They should build on points ("Exactly, and that reminds me..."), ask follow-up questions ("Could you expand on that?"), express agreement/disagreement respectfully ("That's a fair point, but have you considered...?"), and show active listening. + * **Vary Rhythm & Pace:** Mix short, punchy lines with longer, more explanatory ones. Vary sentence beginnings. Use questions to break up exposition. The rhythm should feel spontaneous, not monotonous. + * **Inject Personality & Relatability:** Allow for appropriate humor, moments of surprise or curiosity, brief personal reflections ("I actually experienced something similar..."), or relatable asides that fit the hosts' personas and the topic. Lightly reference past discussions if it enhances context ("Remember last week when we touched on...?"). + +3. **Structure for Flow and Listener Engagement:** + * **Natural Beginning:** Start with dialogue that flows naturally after an introduction (which will be added manually). Avoid redundant greetings or podcast name mentions since these will be added separately. + * **Logical Progression & Signposting:** Guide the listener through the information smoothly. 
Use clear transitions to link different ideas or segments ("So, now that we've covered X, let's dive into Y...", "That actually brings me to another key finding..."). Ensure topics flow logically from one to the next. + * **Meaningful Conclusion:** Summarize the key takeaways or main points discussed, reinforcing the core message derived from the source content. End with a final thought, a lingering question for the audience, or a brief teaser for what's next, providing a sense of closure. Avoid abrupt endings. + +4. **Integrate Source Content Seamlessly & Accurately:** + * **Translate, Don't Recite:** Rephrase information from the `` into conversational language suitable for each host's persona. Avoid directly copying dense sentences or technical jargon without explanation. The goal is discussion, not narration. + * **Explain & Contextualize:** Use analogies, simple examples, storytelling, or have one host ask clarifying questions (acting as a listener surrogate) to break down complex ideas from the source. + * **Weave Information Naturally:** Integrate facts, data, or key points from the source *within* the dialogue, not as standalone, undigested blocks. Attribute information conversationally where appropriate ("The research mentioned...", "Apparently, the key factor is..."). + * **Balance Depth & Accessibility:** Ensure the conversation is informative and factually accurate based on the source content, but prioritize clear communication and engaging delivery over exhaustive technical detail. Make it understandable and interesting for a general audience. + +5. **Length & Pacing:** + * **Six-Minute Duration:** Create a transcript that, when read at a natural speaking pace, would result in approximately 6 minutes of audio. Typically, this means around 1000 words total (based on average speaking rate of 150 words per minute). + * **Concise Speaking Turns:** Keep most speaking turns relatively brief and focused. 
Aim for a natural back-and-forth rhythm rather than extended monologues. + * **Essential Content Only:** Prioritize the most important information from the source content. Focus on quality over quantity, ensuring every line contributes meaningfully to the topic. + + + +Input: "Quantum computing uses quantum bits or qubits which can exist in multiple states simultaneously due to superposition." + +Output: +{{ + "podcast_transcripts": [ + {{ + "speaker_id": 0, + "dialog": "Today we're diving into the mind-bending world of quantum computing. You know, this is a topic I've been excited to cover for weeks." + }}, + {{ + "speaker_id": 1, + "dialog": "Same here! And I know our listeners have been asking for it. But I have to admit, the concept of quantum computing makes my head spin a little. Can we start with the basics?" + }}, + {{ + "speaker_id": 0, + "dialog": "Absolutely. So regular computers use bits, right? Little on-off switches that are either 1 or 0. But quantum computers use something called qubits, and this is where it gets fascinating." + }}, + {{ + "speaker_id": 1, + "dialog": "Wait, what makes qubits so special compared to regular bits?" + }}, + {{ + "speaker_id": 0, + "dialog": "The magic is in something called superposition. These qubits can exist in multiple states at the same time, not just 1 or 0." + }}, + {{ + "speaker_id": 1, + "dialog": "That sounds impossible! How would you even picture that?" + }}, + {{ + "speaker_id": 0, + "dialog": "Think of it like a coin spinning in the air. Before it lands, is it heads or tails?" + }}, + {{ + "speaker_id": 1, + "dialog": "Well, it's... neither? Or I guess both, until it lands? Oh, I think I see where you're going with this." + }} + ] +}} + + +Transform the source material into a lively and engaging podcast conversation. Craft dialogue that showcases authentic host chemistry and natural interaction (including occasional disagreement, building on points, or asking follow-up questions). 
Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates *and* entertains the listener while keeping within a 6-minute audio duration. + +""" \ No newline at end of file diff --git a/surfsense_backend/app/agents/podcaster/state.py b/surfsense_backend/app/agents/podcaster/state.py new file mode 100644 index 000000000..d77270d22 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/state.py @@ -0,0 +1,38 @@ +"""Define the state structures for the agent.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import List, Optional +from pydantic import BaseModel, Field + + +class PodcastTranscriptEntry(BaseModel): + """ + Represents a single entry in a podcast transcript. + """ + speaker_id: int = Field(..., description="The ID of the speaker (0 or 1)") + dialog: str = Field(..., description="The dialog text spoken by the speaker") + + +class PodcastTranscripts(BaseModel): + """ + Represents the full podcast transcript structure. + """ + podcast_transcripts: List[PodcastTranscriptEntry] = Field( + ..., + description="List of transcript entries with alternating speakers" + ) + +@dataclass +class State: + """Defines the input state for the agent, representing a narrower interface to the outside world. + + This class is used to define the initial state and structure of incoming data. + See: https://langchain-ai.github.io/langgraph/concepts/low_level/#state + for more information. + """ + + source_content: str + podcast_transcript: Optional[List[PodcastTranscriptEntry]] = None + final_podcast_file_path: Optional[str] = None diff --git a/surfsense_backend/app/agents/podcaster/test_podcaster.py b/surfsense_backend/app/agents/podcaster/test_podcaster.py new file mode 100644 index 000000000..df6728cc7 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/test_podcaster.py @@ -0,0 +1,474 @@ +#!/usr/bin/env python +""" +Test script for the Surfsense Podcaster agent.
+Run this directly from VS Code to test the Podcaster agent. +""" + +import asyncio +import os +import sys +from pathlib import Path + +# Add the project root to the Python path +project_root = str(Path(__file__).resolve().parent.parent.parent.parent) +if project_root not in sys.path: + sys.path.insert(0, project_root) + +from langchain_core.runnables import RunnableConfig + +# Now import modules using absolute imports +from app.agents.podcaster.graph import graph +from app.agents.podcaster.state import State + + +async def test_podcaster_agent(): + """Test the Podcaster agent with a sample input.""" + + # Print banner + print("=" * 80) + print("SURFSENSE PODCASTER AGENT TEST") + print("=" * 80) + + # Sample input for testing + sample_source_content = """ +

Deep-Live-Cam

+ +

+ Real-time face swap and video deepfake with a single click and only a single image. +

+ +

+hacksider%2FDeep-Live-Cam | Trendshift +

+ +

+ Demo GIF +

+ +## Disclaimer + +This deepfake software is designed to be a productive tool for the AI-generated media industry. It can assist artists in animating custom characters, creating engaging content, and even using models for clothing design. + +We are aware of the potential for unethical applications and are committed to preventative measures. A built-in check prevents the program from processing inappropriate media (nudity, graphic content, sensitive material like war footage, etc.). We will continue to develop this project responsibly, adhering to the law and ethics. We may shut down the project or add watermarks if legally required. + +- Ethical Use: Users are expected to use this software responsibly and legally. If using a real person's face, obtain their consent and clearly label any output as a deepfake when sharing online. + +- Content Restrictions: The software includes built-in checks to prevent processing inappropriate media, such as nudity, graphic content, or sensitive material. + +- Legal Compliance: We adhere to all relevant laws and ethical guidelines. If legally required, we may shut down the project or add watermarks to the output. + +- User Responsibility: We are not responsible for end-user actions. Users must ensure their use of the software aligns with ethical standards and legal requirements. + +By using this software, you agree to these terms and commit to using it in a manner that respects the rights and dignity of others. + +Users are expected to use this software responsibly and legally. If using a real person's face, obtain their consent and clearly label any output as a deepfake when sharing online. We are not responsible for end-user actions. + +## Exclusive v2.0 Quick Start - Pre-built (Windows) + + + +##### This is the fastest build you can get if you have a discrete NVIDIA or AMD GPU. + +###### These Pre-builts are perfect for non-technical users or those who don't have time to, or can't manually install all the requirements. 
Just a heads-up: this is an open-source project, so you can also install it manually. This will be 60 days ahead on the open source version. + +## TLDR; Live Deepfake in just 3 Clicks +![easysteps](https://github.com/user-attachments/assets/af825228-852c-411b-b787-ffd9aac72fc6) +1. Select a face +2. Select which camera to use +3. Press live! + +## Features & Uses - Everything is in real-time + +### Mouth Mask + +**Retain your original mouth for accurate movement using Mouth Mask** + +

+ resizable-gif +

+ +### Face Mapping + +**Use different faces on multiple subjects simultaneously** + +

+ face_mapping_source +

+ +### Your Movie, Your Face + +**Watch movies with any face in real-time** + +

+ movie +

+ +### Live Show + +**Run Live shows and performances** + +

+ show +

+ +### Memes + +**Create Your Most Viral Meme Yet** + +

+ show +
+ Created using Many Faces feature in Deep-Live-Cam +

+ +### Omegle + +**Surprise people on Omegle** + +

+ +

+ +## Installation (Manual) + +**Please be aware that the installation requires technical skills and is not for beginners. Consider downloading the prebuilt version.** + +
+Click to see the process + +### Installation + +This is more likely to work on your computer but will be slower as it utilizes the CPU. + +**1. Set up Your Platform** + +- Python (3.10 recommended) +- pip +- git +- [ffmpeg](https://www.youtube.com/watch?v=OlNWCpFdVMA) - ```iex (irm ffmpeg.tc.ht)``` +- [Visual Studio 2022 Runtimes (Windows)](https://visualstudio.microsoft.com/visual-cpp-build-tools/) + +**2. Clone the Repository** + +```bash +git clone https://github.com/hacksider/Deep-Live-Cam.git +cd Deep-Live-Cam +``` + +**3. Download the Models** + +1. [GFPGANv1.4](https://huggingface.co/hacksider/deep-live-cam/resolve/main/GFPGANv1.4.pth) +2. [inswapper\_128\_fp16.onnx](https://huggingface.co/hacksider/deep-live-cam/resolve/main/inswapper_128_fp16.onnx) + +Place these files in the "**models**" folder. + +**4. Install Dependencies** + +We highly recommend using a `venv` to avoid issues. + + +For Windows: +```bash +python -m venv venv +venv\Scripts\activate +pip install -r requirements.txt +``` +For Linux: +```bash +# Ensure you use the installed Python 3.10 +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +**For macOS:** + +Apple Silicon (M1/M2/M3) requires specific setup: + +```bash +# Install Python 3.10 (specific version is important) +brew install python@3.10 + +# Install tkinter package (required for the GUI) +brew install python-tk@3.10 + +# Create and activate virtual environment with Python 3.10 +python3.10 -m venv venv +source venv/bin/activate + +# Install dependencies +pip install -r requirements.txt +``` + +** In case something goes wrong and you need to reinstall the virtual environment ** + +```bash +# Deactivate the virtual environment +rm -rf venv + +# Reinstall the virtual environment +python -m venv venv +source venv/bin/activate + +# install the dependencies again +pip install -r requirements.txt +``` + +**Run:** If you don't have a GPU, you can run Deep-Live-Cam using `python run.py`. 
Note that initial execution will download models (~300MB). + +### GPU Acceleration + +**CUDA Execution Provider (Nvidia)** + +1. Install [CUDA Toolkit 11.8.0](https://developer.nvidia.com/cuda-11-8-0-download-archive) +2. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-gpu +pip install onnxruntime-gpu==1.16.3 +``` + +3. Usage: + +```bash +python run.py --execution-provider cuda +``` + +**CoreML Execution Provider (Apple Silicon)** + +Apple Silicon (M1/M2/M3) specific installation: + +1. Make sure you've completed the macOS setup above using Python 3.10. +2. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-silicon +pip install onnxruntime-silicon==1.13.1 +``` + +3. Usage (important: specify Python 3.10): + +```bash +python3.10 run.py --execution-provider coreml +``` + +**Important Notes for macOS:** +- You **must** use Python 3.10, not newer versions like 3.11 or 3.13 +- Always run with `python3.10` command not just `python` if you have multiple Python versions installed +- If you get error about `_tkinter` missing, reinstall the tkinter package: `brew reinstall python-tk@3.10` +- If you get model loading errors, check that your models are in the correct folder +- If you encounter conflicts with other Python versions, consider uninstalling them: + ```bash + # List all installed Python versions + brew list | grep python + + # Uninstall conflicting versions if needed + brew uninstall --ignore-dependencies python@3.11 python@3.13 + + # Keep only Python 3.10 + brew cleanup + ``` + +**CoreML Execution Provider (Apple Legacy)** + +1. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-coreml +pip install onnxruntime-coreml==1.13.1 +``` + +2. Usage: + +```bash +python run.py --execution-provider coreml +``` + +**DirectML Execution Provider (Windows)** + +1. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-directml +pip install onnxruntime-directml==1.15.1 +``` + +2. 
Usage: + +```bash +python run.py --execution-provider directml +``` + +**OpenVINO™ Execution Provider (Intel)** + +1. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-openvino +pip install onnxruntime-openvino==1.15.0 +``` + +2. Usage: + +```bash +python run.py --execution-provider openvino +``` +
+ +## Usage + +**1. Image/Video Mode** + +- Execute `python run.py`. +- Choose a source face image and a target image/video. +- Click "Start". +- The output will be saved in a directory named after the target video. + +**2. Webcam Mode** + +- Execute `python run.py`. +- Select a source face image. +- Click "Live". +- Wait for the preview to appear (10-30 seconds). +- Use a screen capture tool like OBS to stream. +- To change the face, select a new source image. + +## Tips and Tricks + +Check out these helpful guides to get the most out of Deep-Live-Cam: + +- [Unlocking the Secrets to the Perfect Deepfake Image](https://deeplivecam.net/index.php/blog/tips-and-tricks/unlocking-the-secrets-to-the-perfect-deepfake-image) - Learn how to create the best deepfake with full head coverage +- [Video Call with DeepLiveCam](https://deeplivecam.net/index.php/blog/tips-and-tricks/video-call-with-deeplivecam) - Make your meetings livelier by using DeepLiveCam with OBS and meeting software +- [Have a Special Guest!](https://deeplivecam.net/index.php/blog/tips-and-tricks/have-a-special-guest) - Tutorial on how to use face mapping to add special guests to your stream +- [Watch Deepfake Movies in Realtime](https://deeplivecam.net/index.php/blog/tips-and-tricks/watch-deepfake-movies-in-realtime) - See yourself star in any video without processing the video +- [Better Quality without Sacrificing Speed](https://deeplivecam.net/index.php/blog/tips-and-tricks/better-quality-without-sacrificing-speed) - Tips for achieving better results without impacting performance +- [Instant Vtuber!](https://deeplivecam.net/index.php/blog/tips-and-tricks/instant-vtuber) - Create a new persona/vtuber easily using Metahuman Creator + +Visit our [official blog](https://deeplivecam.net/index.php/blog/tips-and-tricks) for more tips and tutorials. 
+ +## Command Line Arguments (Unmaintained) + +``` +options: + -h, --help show this help message and exit + -s SOURCE_PATH, --source SOURCE_PATH select a source image + -t TARGET_PATH, --target TARGET_PATH select a target image or video + -o OUTPUT_PATH, --output OUTPUT_PATH select output file or directory + --frame-processor FRAME_PROCESSOR [FRAME_PROCESSOR ...] frame processors (choices: face_swapper, face_enhancer, ...) + --keep-fps keep original fps + --keep-audio keep original audio + --keep-frames keep temporary frames + --many-faces process every face + --map-faces map source target faces + --mouth-mask mask the mouth region + --video-encoder {libx264,libx265,libvpx-vp9} adjust output video encoder + --video-quality [0-51] adjust output video quality + --live-mirror the live camera display as you see it in the front-facing camera frame + --live-resizable the live camera frame is resizable + --max-memory MAX_MEMORY maximum amount of RAM in GB + --execution-provider {cpu} [{cpu} ...] available execution provider (choices: cpu, ...) + --execution-threads EXECUTION_THREADS number of execution threads + -v, --version show program's version number and exit +``` + +Looking for a CLI mode? Using the -s/--source argument will make the run program in cli mode. 
+ +## Press + +**We are always open to criticism and are ready to improve, that's why we didn't cherry-pick anything.** + + - [*"Deep-Live-Cam goes viral, allowing anyone to become a digital doppelganger"*](https://arstechnica.com/information-technology/2024/08/new-ai-tool-enables-real-time-face-swapping-on-webcams-raising-fraud-concerns/) - Ars Technica + - [*"Thanks Deep Live Cam, shapeshifters are among us now"*](https://dataconomy.com/2024/08/15/what-is-deep-live-cam-github-deepfake/) - Dataconomy + - [*"This free AI tool lets you become anyone during video-calls"*](https://www.newsbytesapp.com/news/science/deep-live-cam-ai-impersonation-tool-goes-viral/story) - NewsBytes + - [*"OK, this viral AI live stream software is truly terrifying"*](https://www.creativebloq.com/ai/ok-this-viral-ai-live-stream-software-is-truly-terrifying) - Creative Bloq + - [*"Deepfake AI Tool Lets You Become Anyone in a Video Call With Single Photo"*](https://petapixel.com/2024/08/14/deep-live-cam-deepfake-ai-tool-lets-you-become-anyone-in-a-video-call-with-single-photo-mark-zuckerberg-jd-vance-elon-musk/) - PetaPixel + - [*"Deep-Live-Cam Uses AI to Transform Your Face in Real-Time, Celebrities Included"*](https://www.techeblog.com/deep-live-cam-ai-transform-face/) - TechEBlog + - [*"An AI tool that "makes you look like anyone" during a video call is going viral online"*](https://telegrafi.com/en/a-tool-that-makes-you-look-like-anyone-during-a-video-call-is-going-viral-on-the-Internet/) - Telegrafi + - [*"This Deepfake Tool Turning Images Into Livestreams is Topping the GitHub Charts"*](https://decrypt.co/244565/this-deepfake-tool-turning-images-into-livestreams-is-topping-the-github-charts) - Emerge + - [*"New Real-Time Face-Swapping AI Allows Anyone to Mimic Famous Faces"*](https://www.digitalmusicnews.com/2024/08/15/face-swapping-ai-real-time-mimic/) - Digital Music News + - [*"This real-time webcam deepfake tool raises alarms about the future of identity 
theft"*](https://www.diyphotography.net/this-real-time-webcam-deepfake-tool-raises-alarms-about-the-future-of-identity-theft/) - DIYPhotography + - [*"That's Crazy, Oh God. That's Fucking Freaky Dude... That's So Wild Dude"*](https://www.youtube.com/watch?time_continue=1074&v=py4Tc-Y8BcY) - SomeOrdinaryGamers + - [*"Alright look look look, now look chat, we can do any face we want to look like chat"*](https://www.youtube.com/live/mFsCe7AIxq8?feature=shared&t=2686) - IShowSpeed + +## Credits + +- [ffmpeg](https://ffmpeg.org/): for making video-related operations easy +- [deepinsight](https://github.com/deepinsight): for their [insightface](https://github.com/deepinsight/insightface) project which provided a well-made library and models. Please be reminded that the [use of the model is for non-commercial research purposes only](https://github.com/deepinsight/insightface?tab=readme-ov-file#license). +- [havok2-htwo](https://github.com/havok2-htwo): for sharing the code for webcam +- [GosuDRM](https://github.com/GosuDRM): for the open version of roop +- [pereiraroland26](https://github.com/pereiraroland26): Multiple faces support +- [vic4key](https://github.com/vic4key): For supporting/contributing to this project +- [kier007](https://github.com/kier007): for improving the user experience +- [qitianai](https://github.com/qitianai): for multi-lingual support +- and [all developers](https://github.com/hacksider/Deep-Live-Cam/graphs/contributors) behind libraries used in this project. +- Footnote: Please be informed that the base author of the code is [s0md3v](https://github.com/s0md3v/roop) +- All the wonderful users who helped make this project go viral by starring the repo ❤️ + +[![Stargazers](https://reporoster.com/stars/hacksider/Deep-Live-Cam)](https://github.com/hacksider/Deep-Live-Cam/stargazers) + +## Contributions + +![Alt](https://repobeats.axiom.co/api/embed/fec8e29c45dfdb9c5916f3a7830e1249308d20e1.svg "Repobeats analytics image") + +## Stars to the Moon 🚀 + +
+ + + + Star History Chart + + + """ + + # Create initial state + initial_state = State( + source_content=sample_source_content + ) + + # Configuration with podcast title + config = RunnableConfig( + configurable={ + "podcast_title": "SurfSense" + } + ) + + # Create 'podcasts' directory if it doesn't exist + os.makedirs("podcasts", exist_ok=True) + + # Run the agent + print("\nRunning Podcaster agent...\n") + + try: + # Execute the graph + final_state = await graph.ainvoke(initial_state, config) + + # Print results + print("\nAgent execution completed successfully!") + print(f"Generated podcast file: {final_state.get('final_podcast_file_path', 'No audio file generated')}") + + # If transcript was generated, show a preview + if final_state.get('podcast_transcript'): + print("\nPodcast transcript preview (first 3 entries):") + for i, entry in enumerate(final_state.get('podcast_transcript')[:3]): + # Handle both dictionary and PodcastTranscriptEntry objects + if hasattr(entry, 'speaker_id'): + speaker_id = entry.speaker_id + dialog = entry.dialog + else: + speaker_id = entry.get('speaker_id', 0) + dialog = entry.get('dialog', '') + + print(f"Speaker {speaker_id}: {dialog[:50]}...") + + except Exception as e: + print(f"\nError running the agent: {str(e)}") + raise + + print("\nTest completed!") + return final_state + + +if __name__ == "__main__": + # Run the test function + final_state = asyncio.run(test_podcaster_agent()) \ No newline at end of file diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 8c457e17b..bdc370ea3 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -1,10 +1,12 @@ import os from pathlib import Path +import shutil from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker from dotenv import load_dotenv from langchain_community.chat_models import ChatLiteLLM from rerankers import Reranker +from litellm import speech # Get the base 
directory of the project BASE_DIR = Path(__file__).resolve().parent.parent.parent @@ -13,8 +15,27 @@ env_file = BASE_DIR / ".env" load_dotenv(env_file) +def is_ffmpeg_installed(): + """ + Check if ffmpeg is installed on the current system. + + Returns: + bool: True if ffmpeg is installed, False otherwise. + """ + return shutil.which("ffmpeg") is not None + + class Config: + # Check if ffmpeg is installed + if not is_ffmpeg_installed(): + import static_ffmpeg + # ffmpeg installed on first call to add_paths(), threadsafe. + static_ffmpeg.add_paths() + # check if ffmpeg is installed again + if not is_ffmpeg_installed(): + raise ValueError("FFmpeg is not installed on the system. Please install it to use the Surfsense Podcaster.") + # Database DATABASE_URL = os.getenv("DATABASE_URL") @@ -61,6 +82,9 @@ class Config: # Firecrawl API Key FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None) + # Litellm TTS Configuration + TTS_SERVICE = os.getenv("TTS_SERVICE") + # Validation Checks # Check embedding dimension if hasattr(embedding_model_instance, 'dimension') and embedding_model_instance.dimension > 2000: diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index c447a74b8..cecf70943 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -21,9 +21,11 @@ dependencies = [ "notion-client>=2.3.0", "pgvector>=0.3.6", "playwright>=1.50.0", + "python-ffmpeg>=2.0.12", "rerankers[flashrank]>=0.7.1", "sentence-transformers>=3.4.1", "slack-sdk>=3.34.0", + "static-ffmpeg>=2.13", "tavily-python>=0.3.2", "unstructured-client>=0.30.0", "unstructured[all-docs]>=0.16.25", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index a5621abda..5f90ed9ae 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -590,6 +590,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/1b/e0a87d256e40e8c888847551b20a017a6b98139178505dc7ffb96f04e954/dnspython-2.7.0-py3-none-any.whl", hash = 
"sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86", size = 313632 }, ] +[[package]] +name = "docutils" +version = "0.21.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, +] + [[package]] name = "effdet" version = "0.4.1" @@ -1144,6 +1153,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794 }, ] +[[package]] +name = "id" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/11/102da08f88412d875fa2f1a9a469ff7ad4c874b0ca6fed0048fe385bdb3d/id-1.5.0.tar.gz", hash = "sha256:292cb8a49eacbbdbce97244f47a97b4c62540169c976552e497fd57df0734c1d", size = 15237 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/cb/18326d2d89ad3b0dd143da971e77afd1e6ca6674f1b1c3df4b6bec6279fc/id-1.5.0-py3-none-any.whl", hash = "sha256:f1434e1cef91f2cbb8a4ec64663d5a23b9ed43ef44c4c957d02583d61714c658", size = 13611 }, +] + [[package]] name = "idna" version = "3.10" @@ -1165,6 +1186,48 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/9d/0fb148dc4d6fa4a7dd1d8378168d9b4cd8d4560a6fbf6f0121c5fc34eb68/importlib_metadata-8.6.1-py3-none-any.whl", hash = 
"sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e", size = 26971 }, ] +[[package]] +name = "jaraco-classes" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "more-itertools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/c0/ed4a27bc5571b99e3cff68f8a9fa5b56ff7df1c2251cc715a652ddd26402/jaraco.classes-3.4.0.tar.gz", hash = "sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd", size = 11780 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/66/b15ce62552d84bbfcec9a4873ab79d993a1dd4edb922cbfccae192bd5b5f/jaraco.classes-3.4.0-py3-none-any.whl", hash = "sha256:f662826b6bed8cace05e7ff873ce0f9283b5c924470fe664fff1c2f00f581790", size = 6777 }, +] + +[[package]] +name = "jaraco-context" +version = "6.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/ad/f3777b81bf0b6e7bc7514a1656d3e637b2e8e15fab2ce3235730b3e7a4e6/jaraco_context-6.0.1.tar.gz", hash = "sha256:9bae4ea555cf0b14938dc0aee7c9f32ed303aa20a3b73e7dc80111628792d1b3", size = 13912 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/db/0c52c4cf5e4bd9f5d7135ec7669a3a767af21b3a308e1ed3674881e52b62/jaraco.context-6.0.1-py3-none-any.whl", hash = "sha256:f797fc481b490edb305122c9181830a3a5b76d84ef6d1aef2fb9b47ab956f9e4", size = 6825 }, +] + +[[package]] +name = "jaraco-functools" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "more-itertools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ab/23/9894b3df5d0a6eb44611c36aec777823fc2e07740dabbd0b810e19594013/jaraco_functools-4.1.0.tar.gz", hash = "sha256:70f7e0e2ae076498e212562325e805204fc092d7b4c17e0e86c959e249701a9d", size = 19159 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/4f/24b319316142c44283d7540e76c7b5a6dbd5db623abd86bb7b3491c21018/jaraco.functools-4.1.0-py3-none-any.whl", 
hash = "sha256:ad159f13428bc4acbf5541ad6dec511f91573b90fba04df61dafa2a1231cf649", size = 10187 }, +] + +[[package]] +name = "jeepney" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7b/6f/357efd7602486741aa73ffc0617fb310a29b588ed0fd69c2399acbb85b0c/jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732", size = 106758 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/a3/e137168c9c44d18eff0376253da9f1e9234d0239e0ee230d2fee6cea8e55/jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683", size = 49010 }, +] + [[package]] name = "jinja2" version = "3.1.5" @@ -1269,6 +1332,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/0f/8910b19ac0670a0f80ce1008e5e751c4a57e14d2c4c13a482aa6079fa9d6/jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf", size = 18459 }, ] +[[package]] +name = "keyring" +version = "25.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jaraco-classes" }, + { name = "jaraco-context" }, + { name = "jaraco-functools" }, + { name = "jeepney", marker = "sys_platform == 'linux'" }, + { name = "pywin32-ctypes", marker = "sys_platform == 'win32'" }, + { name = "secretstorage", marker = "sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/70/09/d904a6e96f76ff214be59e7aa6ef7190008f52a0ab6689760a98de0bf37d/keyring-25.6.0.tar.gz", hash = "sha256:0b39998aa941431eb3d9b0d4b2460bc773b9df6fed7621c2dfb291a7e0187a66", size = 62750 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d3/32/da7f44bcb1105d3e88a0b74ebdca50c59121d2ddf71c9e34ba47df7f3a56/keyring-25.6.0-py3-none-any.whl", hash = "sha256:552a3f7af126ece7ed5c89753650eec89c7eaae8617d0aa4d9ad2b75111266bd", size = 39085 }, +] + 
[[package]] name = "kiwisolver" version = "1.4.8" @@ -1754,6 +1834,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cd/76/c8575f90f521017597c5e57e3bfef61e3f27d9cb6c741a82a24d72b10a60/model2vec-0.4.1-py3-none-any.whl", hash = "sha256:04a397a17da9b967082b6baa4c494f0be48c89ec4e1a3975b4f290f045238a38", size = 41972 }, ] +[[package]] +name = "more-itertools" +version = "10.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size = 127671 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278 }, +] + [[package]] name = "mpmath" version = "1.3.0" @@ -1829,6 +1918,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263 }, ] +[[package]] +name = "nh3" +version = "0.2.21" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/37/30/2f81466f250eb7f591d4d193930df661c8c23e9056bdc78e365b646054d8/nh3-0.2.21.tar.gz", hash = "sha256:4990e7ee6a55490dbf00d61a6f476c9a3258e31e711e13713b2ea7d6616f670e", size = 16581 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/81/b83775687fcf00e08ade6d4605f0be9c4584cb44c4973d9f27b7456a31c9/nh3-0.2.21-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:fcff321bd60c6c5c9cb4ddf2554e22772bb41ebd93ad88171bbbb6f271255286", size = 1297678 }, + { url = 
"https://files.pythonhosted.org/packages/22/ee/d0ad8fb4b5769f073b2df6807f69a5e57ca9cea504b78809921aef460d20/nh3-0.2.21-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31eedcd7d08b0eae28ba47f43fd33a653b4cdb271d64f1aeda47001618348fde", size = 733774 }, + { url = "https://files.pythonhosted.org/packages/ea/76/b450141e2d384ede43fe53953552f1c6741a499a8c20955ad049555cabc8/nh3-0.2.21-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d426d7be1a2f3d896950fe263332ed1662f6c78525b4520c8e9861f8d7f0d243", size = 760012 }, + { url = "https://files.pythonhosted.org/packages/97/90/1182275db76cd8fbb1f6bf84c770107fafee0cb7da3e66e416bcb9633da2/nh3-0.2.21-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9d67709bc0d7d1f5797b21db26e7a8b3d15d21c9c5f58ccfe48b5328483b685b", size = 923619 }, + { url = "https://files.pythonhosted.org/packages/29/c7/269a7cfbec9693fad8d767c34a755c25ccb8d048fc1dfc7a7d86bc99375c/nh3-0.2.21-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:55823c5ea1f6b267a4fad5de39bc0524d49a47783e1fe094bcf9c537a37df251", size = 1000384 }, + { url = "https://files.pythonhosted.org/packages/68/a9/48479dbf5f49ad93f0badd73fbb48b3d769189f04c6c69b0df261978b009/nh3-0.2.21-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:818f2b6df3763e058efa9e69677b5a92f9bc0acff3295af5ed013da544250d5b", size = 918908 }, + { url = "https://files.pythonhosted.org/packages/d7/da/0279c118f8be2dc306e56819880b19a1cf2379472e3b79fc8eab44e267e3/nh3-0.2.21-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b3b5c58161e08549904ac4abd450dacd94ff648916f7c376ae4b2c0652b98ff9", size = 909180 }, + { url = "https://files.pythonhosted.org/packages/26/16/93309693f8abcb1088ae143a9c8dbcece9c8f7fb297d492d3918340c41f1/nh3-0.2.21-cp313-cp313t-win32.whl", hash = "sha256:637d4a10c834e1b7d9548592c7aad760611415fcd5bd346f77fd8a064309ae6d", size = 532747 }, + { url = 
"https://files.pythonhosted.org/packages/a2/3a/96eb26c56cbb733c0b4a6a907fab8408ddf3ead5d1b065830a8f6a9c3557/nh3-0.2.21-cp313-cp313t-win_amd64.whl", hash = "sha256:713d16686596e556b65e7f8c58328c2df63f1a7abe1277d87625dcbbc012ef82", size = 528908 }, + { url = "https://files.pythonhosted.org/packages/ba/1d/b1ef74121fe325a69601270f276021908392081f4953d50b03cbb38b395f/nh3-0.2.21-cp38-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a772dec5b7b7325780922dd904709f0f5f3a79fbf756de5291c01370f6df0967", size = 1316133 }, + { url = "https://files.pythonhosted.org/packages/b8/f2/2c7f79ce6de55b41e7715f7f59b159fd59f6cdb66223c05b42adaee2b645/nh3-0.2.21-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d002b648592bf3033adfd875a48f09b8ecc000abd7f6a8769ed86b6ccc70c759", size = 758328 }, + { url = "https://files.pythonhosted.org/packages/6d/ad/07bd706fcf2b7979c51b83d8b8def28f413b090cf0cb0035ee6b425e9de5/nh3-0.2.21-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2a5174551f95f2836f2ad6a8074560f261cf9740a48437d6151fd2d4d7d617ab", size = 747020 }, + { url = "https://files.pythonhosted.org/packages/75/99/06a6ba0b8a0d79c3d35496f19accc58199a1fb2dce5e711a31be7e2c1426/nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b8d55ea1fc7ae3633d758a92aafa3505cd3cc5a6e40470c9164d54dff6f96d42", size = 944878 }, + { url = "https://files.pythonhosted.org/packages/79/d4/dc76f5dc50018cdaf161d436449181557373869aacf38a826885192fc587/nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ae319f17cd8960d0612f0f0ddff5a90700fa71926ca800e9028e7851ce44a6f", size = 903460 }, + { url = "https://files.pythonhosted.org/packages/cd/c3/d4f8037b2ab02ebf5a2e8637bd54736ed3d0e6a2869e10341f8d9085f00e/nh3-0.2.21-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:63ca02ac6f27fc80f9894409eb61de2cb20ef0a23740c7e29f9ec827139fa578", size = 839369 }, + { url = 
"https://files.pythonhosted.org/packages/11/a9/1cd3c6964ec51daed7b01ca4686a5c793581bf4492cbd7274b3f544c9abe/nh3-0.2.21-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5f77e62aed5c4acad635239ac1290404c7e940c81abe561fd2af011ff59f585", size = 739036 }, + { url = "https://files.pythonhosted.org/packages/fd/04/bfb3ff08d17a8a96325010ae6c53ba41de6248e63cdb1b88ef6369a6cdfc/nh3-0.2.21-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:087ffadfdcd497658c3adc797258ce0f06be8a537786a7217649fc1c0c60c293", size = 768712 }, + { url = "https://files.pythonhosted.org/packages/9e/aa/cfc0bf545d668b97d9adea4f8b4598667d2b21b725d83396c343ad12bba7/nh3-0.2.21-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ac7006c3abd097790e611fe4646ecb19a8d7f2184b882f6093293b8d9b887431", size = 930559 }, + { url = "https://files.pythonhosted.org/packages/78/9d/6f5369a801d3a1b02e6a9a097d56bcc2f6ef98cffebf03c4bb3850d8e0f0/nh3-0.2.21-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:6141caabe00bbddc869665b35fc56a478eb774a8c1dfd6fba9fe1dfdf29e6efa", size = 1008591 }, + { url = "https://files.pythonhosted.org/packages/a6/df/01b05299f68c69e480edff608248313cbb5dbd7595c5e048abe8972a57f9/nh3-0.2.21-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:20979783526641c81d2f5bfa6ca5ccca3d1e4472474b162c6256745fbfe31cd1", size = 925670 }, + { url = "https://files.pythonhosted.org/packages/3d/79/bdba276f58d15386a3387fe8d54e980fb47557c915f5448d8c6ac6f7ea9b/nh3-0.2.21-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a7ea28cd49293749d67e4fcf326c554c83ec912cd09cd94aa7ec3ab1921c8283", size = 917093 }, + { url = "https://files.pythonhosted.org/packages/e7/d8/c6f977a5cd4011c914fb58f5ae573b071d736187ccab31bfb1d539f4af9f/nh3-0.2.21-cp38-abi3-win32.whl", hash = "sha256:6c9c30b8b0d291a7c5ab0967ab200598ba33208f754f2f4920e9343bdd88f79a", size = 537623 }, + { url = 
"https://files.pythonhosted.org/packages/23/fc/8ce756c032c70ae3dd1d48a3552577a325475af2a2f629604b44f571165c/nh3-0.2.21-cp38-abi3-win_amd64.whl", hash = "sha256:bb0014948f04d7976aabae43fcd4cb7f551f9f8ce785a4c9ef66e6c2590f8629", size = 535283 }, +] + [[package]] name = "nltk" version = "3.9.1" @@ -2366,6 +2486,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/2b/e944e10c9b18e77e43d3bb4d6faa323f6cc27597db37b75bc3fd796adfd5/playwright-1.50.0-py3-none-win_amd64.whl", hash = "sha256:1859423da82de631704d5e3d88602d755462b0906824c1debe140979397d2e8d", size = 34784546 }, ] +[[package]] +name = "progress" +version = "1.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/68/d8412d1e0d70edf9791cbac5426dc859f4649afc22f2abbeb0d947cf70fd/progress-1.6.tar.gz", hash = "sha256:c9c86e98b5c03fa1fe11e3b67c1feda4788b8d0fe7336c2ff7d5644ccfba34cd", size = 7842 } + [[package]] name = "propcache" version = "0.2.1" @@ -2705,6 +2831,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863 }, ] +[[package]] +name = "python-ffmpeg" +version = "2.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyee" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/4d/7ecffb341d646e016be76e36f5a42cb32f409c9ca21a57b68f067fad3fc7/python_ffmpeg-2.0.12.tar.gz", hash = "sha256:19ac80af5a064a2f53c245af1a909b2d7648ea045500d96d3bcd507b88d43dc7", size = 14126292 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/6d/02e817aec661defe148cb9eb0c4eca2444846305f625c2243fb9f92a9045/python_ffmpeg-2.0.12-py3-none-any.whl", hash = "sha256:d86697da8dfb39335183e336d31baf42fb217468adf5ac97fd743898240faae3", size = 14411 }, +] + [[package]] 
name = "python-iso639" version = "2025.2.18" @@ -2770,6 +2909,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57", size = 507930 }, ] +[[package]] +name = "pywin32-ctypes" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/9f/01a1a99704853cb63f253eea009390c88e7131c67e66a0a02099a8c917cb/pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755", size = 29471 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/3d/8161f7711c017e01ac9f008dfddd9410dff3674334c233bde66e7ba65bbf/pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8", size = 30756 }, +] + [[package]] name = "pyyaml" version = "6.0.2" @@ -2834,6 +2982,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/43/ca3d1018b392f49131843648e10b08ace23afe8dad3bee5f136e4346b7cd/rapidfuzz-3.12.2-cp313-cp313-win_arm64.whl", hash = "sha256:69f6ecdf1452139f2b947d0c169a605de578efdb72cbb2373cb0a94edca1fd34", size = 863535 }, ] +[[package]] +name = "readme-renderer" +version = "44.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docutils" }, + { name = "nh3" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5a/a9/104ec9234c8448c4379768221ea6df01260cd6c2ce13182d4eac531c8342/readme_renderer-44.0.tar.gz", hash = "sha256:8712034eabbfa6805cacf1402b4eeb2a73028f72d1166d6f5cb7f9c047c5d1e1", size = 32056 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/67/921ec3024056483db83953ae8e48079ad62b92db7880013ca77632921dd0/readme_renderer-44.0-py3-none-any.whl", hash = 
"sha256:2fbca89b81a08526aadf1357a8c2ae889ec05fb03f5da67f9769c9a592166151", size = 13310 }, +] + [[package]] name = "referencing" version = "0.36.2" @@ -2927,6 +3089,15 @@ flashrank = [ { name = "flashrank" }, ] +[[package]] +name = "rfc3986" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/40/1520d68bfa07ab5a6f065a186815fb6610c86fe957bc065754e47f7b0840/rfc3986-2.0.0.tar.gz", hash = "sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c", size = 49026 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/9a/9afaade874b2fa6c752c36f1548f718b5b83af81ed9b76628329dab81c1b/rfc3986-2.0.0-py2.py3-none-any.whl", hash = "sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd", size = 31326 }, +] + [[package]] name = "rich" version = "14.0.0" @@ -3083,6 +3254,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e4/1f/5d46a8d94e9f6d2c913cbb109e57e7eed914de38ea99e2c4d69a9fc93140/scipy-1.15.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bc7136626261ac1ed988dca56cfc4ab5180f75e0ee52e58f1e6aa74b5f3eacd5", size = 43181730 }, ] +[[package]] +name = "secretstorage" +version = "3.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography", marker = "sys_platform != 'darwin'" }, + { name = "jeepney", marker = "sys_platform != 'darwin'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/53/a4/f48c9d79cb507ed1373477dbceaba7401fd8a23af63b837fa61f1dcd3691/SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77", size = 19739 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/24/b4293291fa1dd830f353d2cb163295742fa87f179fcc8a20a306a81978b7/SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99", size = 15221 }, +] + [[package]] name = "sentence-transformers" version = 
"3.4.1" @@ -3192,6 +3376,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/61/f2b52e107b1fc8944b33ef56bf6ac4ebbe16d91b94d2b87ce013bf63fb84/starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d", size = 71507 }, ] +[[package]] +name = "static-ffmpeg" +version = "2.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "progress" }, + { name = "requests" }, + { name = "twine" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/09/39/1a5d0603280dd681ec52a2a6717c05dab530190dff7887b7603740a1741b/static_ffmpeg-2.13-py3-none-any.whl", hash = "sha256:3bed55a7979f9de9d1eec1126b98774a1d41c2e323811f59973d54b9c94d6dac", size = 7586 }, +] + [[package]] name = "surf-new-backend" version = "0.0.6" @@ -3213,9 +3411,11 @@ dependencies = [ { name = "notion-client" }, { name = "pgvector" }, { name = "playwright" }, + { name = "python-ffmpeg" }, { name = "rerankers", extra = ["flashrank"] }, { name = "sentence-transformers" }, { name = "slack-sdk" }, + { name = "static-ffmpeg" }, { name = "tavily-python" }, { name = "unstructured", extra = ["all-docs"] }, { name = "unstructured-client" }, @@ -3242,9 +3442,11 @@ requires-dist = [ { name = "notion-client", specifier = ">=2.3.0" }, { name = "pgvector", specifier = ">=0.3.6" }, { name = "playwright", specifier = ">=1.50.0" }, + { name = "python-ffmpeg", specifier = ">=2.0.12" }, { name = "rerankers", extras = ["flashrank"], specifier = ">=0.7.1" }, { name = "sentence-transformers", specifier = ">=3.4.1" }, { name = "slack-sdk", specifier = ">=3.34.0" }, + { name = "static-ffmpeg", specifier = ">=2.13" }, { name = "tavily-python", specifier = ">=0.3.2" }, { name = "unstructured", extras = ["all-docs"], specifier = ">=0.16.25" }, { name = "unstructured-client", specifier = ">=0.30.0" }, @@ -3549,6 +3751,26 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/c7/30/37a3384d1e2e9320331baca41e835e90a3767303642c7a80d4510152cbcf/triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0", size = 253154278 }, ] +[[package]] +name = "twine" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "id" }, + { name = "keyring", marker = "platform_machine != 'ppc64le' and platform_machine != 's390x'" }, + { name = "packaging" }, + { name = "readme-renderer" }, + { name = "requests" }, + { name = "requests-toolbelt" }, + { name = "rfc3986" }, + { name = "rich" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c8/a2/6df94fc5c8e2170d21d7134a565c3a8fb84f9797c1dd65a5976aaf714418/twine-6.1.0.tar.gz", hash = "sha256:be324f6272eff91d07ee93f251edf232fc647935dd585ac003539b42404a8dbd", size = 168404 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/b6/74e927715a285743351233f33ea3c684528a0d374d2e43ff9ce9585b73fe/twine-6.1.0-py3-none-any.whl", hash = "sha256:a47f973caf122930bf0fbbf17f80b83bc1602c9ce393c7845f289a3001dc5384", size = 40791 }, +] + [[package]] name = "types-requests" version = "2.32.0.20250328" From b4bee887bdb95593bede649447a450e3843c4b56 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Mon, 5 May 2025 23:18:12 -0700 Subject: [PATCH 12/70] feat: Added Podcast Feature and its actually fast. 
- Fully Async --- .../6_change_podcast_content_to_transcript.py | 44 + .../versions/7_remove_is_generated_column.py | 28 + .../app/agents/podcaster/graph.py | 32 +- .../app/agents/podcaster/nodes.py | 2 +- .../app/agents/podcaster/prompts.py | 2 +- surfsense_backend/app/db.py | 3 +- .../app/routes/podcasts_routes.py | 129 ++- surfsense_backend/app/schemas/__init__.py | 3 +- surfsense_backend/app/schemas/chats.py | 8 +- surfsense_backend/app/schemas/podcasts.py | 12 +- surfsense_backend/app/tasks/podcast_tasks.py | 94 +++ .../[search_space_id]/chats/chats-client.tsx | 447 ++++++++-- .../dashboard/[search_space_id]/layout.tsx | 7 + .../[search_space_id]/podcasts/page.tsx | 22 + .../podcasts/podcasts-client.tsx | 787 ++++++++++++++++++ .../components/sidebar/app-sidebar.tsx | 4 +- surfsense_web/components/ui/slider.tsx | 28 + surfsense_web/package.json | 1 + surfsense_web/pnpm-lock.yaml | 98 +++ 19 files changed, 1676 insertions(+), 75 deletions(-) create mode 100644 surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py create mode 100644 surfsense_backend/alembic/versions/7_remove_is_generated_column.py create mode 100644 surfsense_backend/app/tasks/podcast_tasks.py create mode 100644 surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx create mode 100644 surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx create mode 100644 surfsense_web/components/ui/slider.tsx diff --git a/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py new file mode 100644 index 000000000..991948f3a --- /dev/null +++ b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py @@ -0,0 +1,44 @@ +"""Change podcast_content to podcast_transcript with JSON type + +Revision ID: 6 +Revises: 5 +Create Date: 2023-08-15 00:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as 
sa +from sqlalchemy.dialects.postgresql import JSON + + +# revision identifiers, used by Alembic. +revision: str = '6' +down_revision: Union[str, None] = '5' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Drop the old column and create a new one with the new name and type + # We need to do this because PostgreSQL doesn't support direct column renames with type changes + op.add_column('podcasts', sa.Column('podcast_transcript', JSON, nullable=False, server_default='{}')) + + # Copy data from old column to new column + # Convert text to JSON by storing it as a JSON string value + op.execute("UPDATE podcasts SET podcast_transcript = jsonb_build_object('text', podcast_content) WHERE podcast_content != ''") + + # Drop the old column + op.drop_column('podcasts', 'podcast_content') + + +def downgrade() -> None: + # Add back the original column + op.add_column('podcasts', sa.Column('podcast_content', sa.Text(), nullable=False, server_default='')) + + # Copy data from JSON column back to text column + # Extract the 'text' field if it exists, otherwise use empty string + op.execute("UPDATE podcasts SET podcast_content = COALESCE((podcast_transcript->>'text'), '')") + + # Drop the new column + op.drop_column('podcasts', 'podcast_transcript') \ No newline at end of file diff --git a/surfsense_backend/alembic/versions/7_remove_is_generated_column.py b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py new file mode 100644 index 000000000..c5d25ad70 --- /dev/null +++ b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py @@ -0,0 +1,28 @@ +"""Remove is_generated column from podcasts table + +Revision ID: 7 +Revises: 6 +Create Date: 2023-08-15 01:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '7' +down_revision: Union[str, None] = '6' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Drop the is_generated column + op.drop_column('podcasts', 'is_generated') + + +def downgrade() -> None: + # Add back the is_generated column with its original constraints + op.add_column('podcasts', sa.Column('is_generated', sa.Boolean(), nullable=False, server_default='false')) \ No newline at end of file diff --git a/surfsense_backend/app/agents/podcaster/graph.py b/surfsense_backend/app/agents/podcaster/graph.py index f4604a7c8..d102432ef 100644 --- a/surfsense_backend/app/agents/podcaster/graph.py +++ b/surfsense_backend/app/agents/podcaster/graph.py @@ -6,18 +6,26 @@ from .state import State from .nodes import create_merged_podcast_audio, create_podcast_transcript -# Define a new graph -workflow = StateGraph(State, config_schema=Configuration) -# Add the node to the graph -workflow.add_node("create_podcast_transcript", create_podcast_transcript) -workflow.add_node("create_merged_podcast_audio", create_merged_podcast_audio) +def build_graph(): + + # Define a new graph + workflow = StateGraph(State, config_schema=Configuration) -# Set the entrypoint as `call_model` -workflow.add_edge("__start__", "create_podcast_transcript") -workflow.add_edge("create_podcast_transcript", "create_merged_podcast_audio") -workflow.add_edge("create_merged_podcast_audio", "__end__") + # Add the node to the graph + workflow.add_node("create_podcast_transcript", create_podcast_transcript) + workflow.add_node("create_merged_podcast_audio", create_merged_podcast_audio) -# Compile the workflow into an executable graph -graph = workflow.compile() -graph.name = "Surfsense Podcaster" # This defines the custom name in LangSmith + # Set the entrypoint as `call_model` + workflow.add_edge("__start__", "create_podcast_transcript") + workflow.add_edge("create_podcast_transcript", 
"create_merged_podcast_audio") + workflow.add_edge("create_merged_podcast_audio", "__end__") + + # Compile the workflow into an executable graph + graph = workflow.compile() + graph.name = "Surfsense Podcaster" # This defines the custom name in LangSmith + + return graph + +# Compile the graph once when the module is loaded +graph = build_graph() diff --git a/surfsense_backend/app/agents/podcaster/nodes.py b/surfsense_backend/app/agents/podcaster/nodes.py index 810307ec2..19a233a6c 100644 --- a/surfsense_backend/app/agents/podcaster/nodes.py +++ b/surfsense_backend/app/agents/podcaster/nodes.py @@ -28,7 +28,7 @@ async def create_podcast_transcript(state: State, config: RunnableConfig) -> Dic # Create the messages messages = [ SystemMessage(content=prompt), - HumanMessage(content=state.source_content) + HumanMessage(content=f"{state.source_content}") ] # Generate the podcast transcript diff --git a/surfsense_backend/app/agents/podcaster/prompts.py b/surfsense_backend/app/agents/podcaster/prompts.py index 2b4bdcfec..c08d38e31 100644 --- a/surfsense_backend/app/agents/podcaster/prompts.py +++ b/surfsense_backend/app/agents/podcaster/prompts.py @@ -106,6 +106,6 @@ Output: }} -Transform the source material into a lively and engaging podcast conversation. Craft dialogue that showcases authentic host chemistry and natural interaction (including occasional disagreement, building on points, or asking follow-up questions). Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates *and* entertains the listener while keeping within a 3-minute audio duration. +Transform the source material into a lively and engaging podcast conversation. Craft dialogue that showcases authentic host chemistry and natural interaction (including occasional disagreement, building on points, or asking follow-up questions). 
Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates *and* entertains the listener while keeping within a 5-minute audio duration. """ \ No newline at end of file diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index b4ee3e790..7327c3a0c 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -110,8 +110,7 @@ class Podcast(BaseModel, TimestampMixin): __tablename__ = "podcasts" title = Column(String, nullable=False, index=True) - is_generated = Column(Boolean, nullable=False, default=False) - podcast_content = Column(Text, nullable=False, default="") + podcast_transcript = Column(JSON, nullable=False, default={}) file_location = Column(String(500), nullable=False, default="") search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) diff --git a/surfsense_backend/app/routes/podcasts_routes.py b/surfsense_backend/app/routes/podcasts_routes.py index 7ac1da1ba..bc82e21d0 100644 --- a/surfsense_backend/app/routes/podcasts_routes.py +++ b/surfsense_backend/app/routes/podcasts_routes.py @@ -1,12 +1,16 @@ -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.exc import IntegrityError, SQLAlchemyError from typing import List -from app.db import get_async_session, User, SearchSpace, Podcast -from app.schemas import PodcastCreate, PodcastUpdate, PodcastRead +from app.db import get_async_session, User, SearchSpace, Podcast, Chat +from app.schemas import PodcastCreate, PodcastUpdate, PodcastRead, PodcastGenerateRequest from app.users import current_active_user from app.utils.check_ownership import check_ownership +from app.tasks.podcast_tasks import generate_chat_podcast +from fastapi.responses import StreamingResponse +import os +from pathlib import Path router = 
APIRouter() @@ -119,4 +123,121 @@ async def delete_podcast( raise he except SQLAlchemyError: await session.rollback() - raise HTTPException(status_code=500, detail="Database error occurred while deleting podcast") \ No newline at end of file + raise HTTPException(status_code=500, detail="Database error occurred while deleting podcast") + +async def generate_chat_podcast_with_new_session( + chat_id: int, + search_space_id: int, + podcast_title: str = "SurfSense Podcast" +): + """Create a new session and process chat podcast generation.""" + from app.db import async_session_maker + + async with async_session_maker() as session: + try: + await generate_chat_podcast(session, chat_id, search_space_id, podcast_title) + except Exception as e: + import logging + logging.error(f"Error generating podcast from chat: {str(e)}") + +@router.post("/podcasts/generate/") +async def generate_podcast( + request: PodcastGenerateRequest, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), + fastapi_background_tasks: BackgroundTasks = BackgroundTasks() +): + try: + # Check if the user owns the search space + await check_ownership(session, SearchSpace, request.search_space_id, user) + + if request.type == "CHAT": + # Verify that all chat IDs belong to this user and search space + query = select(Chat).filter( + Chat.id.in_(request.ids), + Chat.search_space_id == request.search_space_id + ).join(SearchSpace).filter(SearchSpace.user_id == user.id) + + result = await session.execute(query) + valid_chats = result.scalars().all() + valid_chat_ids = [chat.id for chat in valid_chats] + + # If any requested ID is not in valid IDs, raise error immediately + if len(valid_chat_ids) != len(request.ids): + raise HTTPException( + status_code=403, + detail="One or more chat IDs do not belong to this user or search space" + ) + + # Queue a separate podcast generation task for each validated chat ID + for chat_id in valid_chat_ids: + fastapi_background_tasks.add_task( + 
generate_chat_podcast_with_new_session, + chat_id, + request.search_space_id, + request.podcast_title + ) + + return { + "message": "Podcast generation started", + } + except HTTPException as he: + raise he + except IntegrityError as e: + await session.rollback() + raise HTTPException(status_code=400, detail="Podcast generation failed due to constraint violation") + except SQLAlchemyError as e: + await session.rollback() + raise HTTPException(status_code=500, detail="Database error occurred while generating podcast") + except Exception as e: + await session.rollback() + raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}") + +@router.get("/podcasts/{podcast_id}/stream") +async def stream_podcast( + podcast_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user) +): + """Stream a podcast audio file.""" + try: + # Get the podcast and check if user has access + result = await session.execute( + select(Podcast) + .join(SearchSpace) + .filter(Podcast.id == podcast_id, SearchSpace.user_id == user.id) + ) + podcast = result.scalars().first() + + if not podcast: + raise HTTPException( + status_code=404, + detail="Podcast not found or you don't have permission to access it" + ) + + # Get the file path + file_path = podcast.file_location + + # Check if the file exists + if not os.path.isfile(file_path): + raise HTTPException(status_code=404, detail="Podcast audio file not found") + + # Define a generator function to stream the file + def iterfile(): + with open(file_path, mode="rb") as file_like: + yield from file_like + + # Return a streaming response with appropriate headers + return StreamingResponse( + iterfile(), + media_type="audio/mpeg", + headers={ + "Accept-Ranges": "bytes", + "Content-Disposition": f"inline; filename={Path(file_path).name}" + } + ) + + except HTTPException as he: + raise he + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error streaming 
podcast: {str(e)}") \ No newline at end of file diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py index 07adf24de..21688dfb0 100644 --- a/surfsense_backend/app/schemas/__init__.py +++ b/surfsense_backend/app/schemas/__init__.py @@ -10,7 +10,7 @@ from .documents import ( DocumentRead, ) from .chunks import ChunkBase, ChunkCreate, ChunkUpdate, ChunkRead -from .podcasts import PodcastBase, PodcastCreate, PodcastUpdate, PodcastRead +from .podcasts import PodcastBase, PodcastCreate, PodcastUpdate, PodcastRead, PodcastGenerateRequest from .chats import ChatBase, ChatCreate, ChatUpdate, ChatRead, AISDKChatRequest from .search_source_connector import SearchSourceConnectorBase, SearchSourceConnectorCreate, SearchSourceConnectorUpdate, SearchSourceConnectorRead @@ -39,6 +39,7 @@ __all__ = [ "PodcastCreate", "PodcastUpdate", "PodcastRead", + "PodcastGenerateRequest", "ChatBase", "ChatCreate", "ChatUpdate", diff --git a/surfsense_backend/app/schemas/chats.py b/surfsense_backend/app/schemas/chats.py index ad7829b26..f5eefc532 100644 --- a/surfsense_backend/app/schemas/chats.py +++ b/surfsense_backend/app/schemas/chats.py @@ -1,8 +1,10 @@ from typing import Any, Dict, List, Optional -from pydantic import BaseModel -from sqlalchemy import JSON -from .base import IDModel, TimestampModel + from app.db import ChatType +from pydantic import BaseModel + +from .base import IDModel, TimestampModel + class ChatBase(BaseModel): type: ChatType diff --git a/surfsense_backend/app/schemas/podcasts.py b/surfsense_backend/app/schemas/podcasts.py index fbec5482b..4132fb211 100644 --- a/surfsense_backend/app/schemas/podcasts.py +++ b/surfsense_backend/app/schemas/podcasts.py @@ -1,10 +1,10 @@ from pydantic import BaseModel +from typing import Any, List, Literal from .base import IDModel, TimestampModel class PodcastBase(BaseModel): title: str - is_generated: bool = False - podcast_content: str = "" + podcast_transcript: List[Any] file_location: str 
= "" search_space_id: int @@ -16,4 +16,10 @@ class PodcastUpdate(PodcastBase): class PodcastRead(PodcastBase, IDModel, TimestampModel): class Config: - from_attributes = True \ No newline at end of file + from_attributes = True + +class PodcastGenerateRequest(BaseModel): + type: Literal["DOCUMENT", "CHAT"] + ids: List[int] + search_space_id: int + podcast_title: str = "SurfSense Podcast" \ No newline at end of file diff --git a/surfsense_backend/app/tasks/podcast_tasks.py b/surfsense_backend/app/tasks/podcast_tasks.py new file mode 100644 index 000000000..e148f5465 --- /dev/null +++ b/surfsense_backend/app/tasks/podcast_tasks.py @@ -0,0 +1,94 @@ +from sqlalchemy.ext.asyncio import AsyncSession +from app.schemas import PodcastGenerateRequest +from typing import List +from sqlalchemy import select +from app.db import Chat, Podcast +from app.agents.podcaster.graph import graph as podcaster_graph +from app.agents.podcaster.state import State + + +async def generate_document_podcast( + session: AsyncSession, + document_id: int, + search_space_id: int, + user_id: int +): + # TODO: Need to fetch the document chunks, then concatenate them and pass them to the podcast generation model + pass + + + +async def generate_chat_podcast( + session: AsyncSession, + chat_id: int, + search_space_id: int, + podcast_title: str +): + # Fetch the chat with the specified ID + query = select(Chat).filter( + Chat.id == chat_id, + Chat.search_space_id == search_space_id + ) + + result = await session.execute(query) + chat = result.scalars().first() + + if not chat: + raise ValueError(f"Chat with id {chat_id} not found in search space {search_space_id}") + + # Create chat history structure + chat_history_str = "" + + for message in chat.messages: + if message["role"] == "user": + chat_history_str += f"{message['content']}" + elif message["role"] == "assistant": + # Last annotation type will always be "ANSWER" here + answer_annotation = message["annotations"][-1] + 
answer_text = "" + if answer_annotation["type"] == "ANSWER": + answer_text = answer_annotation["content"] + # If content is a list, join it into a single string + if isinstance(answer_text, list): + answer_text = "\n".join(answer_text) + chat_history_str += f"{answer_text}" + + chat_history_str += "" + + # Pass it to the SurfSense Podcaster + config = { + "configurable": { + "podcast_title" : "Surfsense", + } + } + # Initialize state with database session and streaming service + initial_state = State( + source_content=chat_history_str, + ) + + # Run the graph directly + result = await podcaster_graph.ainvoke(initial_state, config=config) + + # Convert podcast transcript entries to serializable format + serializable_transcript = [] + for entry in result["podcast_transcript"]: + serializable_transcript.append({ + "speaker_id": entry.speaker_id, + "dialog": entry.dialog + }) + + # Create a new podcast entry + podcast = Podcast( + title=f"{podcast_title}", + podcast_transcript=serializable_transcript, + file_location=result["final_podcast_file_path"], + search_space_id=search_space_id + ) + + # Add to session and commit + session.add(podcast) + await session.commit() + await session.refresh(podcast) + + return podcast + diff --git a/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx b/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx index c481bd6ec..6501ca684 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx @@ -3,7 +3,7 @@ import { useState, useEffect } from 'react'; import { motion, AnimatePresence } from 'framer-motion'; import { useSearchParams } from 'next/navigation'; -import { MessageCircleMore, Search, Calendar, Tag, Trash2, ExternalLink, MoreHorizontal } from 'lucide-react'; +import { MessageCircleMore, Search, Calendar, Tag, Trash2, ExternalLink, MoreHorizontal, Radio, CheckCircle, Circle, Podcast } from 
'lucide-react'; import { format } from 'date-fns'; // UI Components @@ -42,6 +42,9 @@ import { SelectTrigger, SelectValue, } from "@/components/ui/select"; +import { Checkbox } from "@/components/ui/checkbox"; +import { Label } from "@/components/ui/label"; +import { toast } from "sonner"; interface Chat { created_at: string; @@ -92,6 +95,18 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) const [chatToDelete, setChatToDelete] = useState<{ id: number, title: string } | null>(null); const [isDeleting, setIsDeleting] = useState(false); + // New state for podcast generation + const [selectedChats, setSelectedChats] = useState([]); + const [selectionMode, setSelectionMode] = useState(false); + const [podcastDialogOpen, setPodcastDialogOpen] = useState(false); + const [podcastTitle, setPodcastTitle] = useState(""); + const [isGeneratingPodcast, setIsGeneratingPodcast] = useState(false); + + // New state for individual podcast generation + const [currentChatIndex, setCurrentChatIndex] = useState(0); + const [podcastTitles, setPodcastTitles] = useState<{[key: number]: string}>({}); + const [processingChat, setProcessingChat] = useState(null); + const chatsPerPage = 9; const searchParams = useSearchParams(); @@ -234,6 +249,177 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) // Get unique chat types for filter dropdown const chatTypes = ['all', ...Array.from(new Set(chats.map(chat => chat.type)))]; + // Generate individual podcasts from selected chats + const handleGeneratePodcast = async () => { + if (selectedChats.length === 0) { + toast.error("Please select at least one chat"); + return; + } + + const currentChatId = selectedChats[currentChatIndex]; + const currentTitle = podcastTitles[currentChatId] || podcastTitle; + + if (!currentTitle.trim()) { + toast.error("Please enter a podcast title"); + return; + } + + setIsGeneratingPodcast(true); + try { + const token = 
localStorage.getItem('surfsense_bearer_token'); + if (!token) { + toast.error("Authentication error. Please log in again."); + setIsGeneratingPodcast(false); + return; + } + + // Create payload for single chat + const payload = { + type: "CHAT", + ids: [currentChatId], // Single chat ID + search_space_id: parseInt(searchSpaceId), + podcast_title: currentTitle + }; + + const response = await fetch(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/generate/`, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(payload) + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error(errorData.detail || "Failed to generate podcast"); + } + + const data = await response.json(); + toast.success(`Podcast "${currentTitle}" generation started!`); + + // Move to the next chat or finish + if (currentChatIndex < selectedChats.length - 1) { + // Set up for next chat + setCurrentChatIndex(currentChatIndex + 1); + + // Find the next chat from the chats array + const nextChatId = selectedChats[currentChatIndex + 1]; + const nextChat = chats.find(chat => chat.id === nextChatId) || null; + setProcessingChat(nextChat); + + // Default title for the next chat + if (!podcastTitles[nextChatId]) { + setPodcastTitle(nextChat?.title || `Podcast from Chat ${nextChatId}`); + } else { + setPodcastTitle(podcastTitles[nextChatId]); + } + + setIsGeneratingPodcast(false); + } else { + // All done + finishPodcastGeneration(); + } + } catch (error) { + console.error('Error generating podcast:', error); + toast.error(error instanceof Error ? error.message : 'Failed to generate podcast'); + setIsGeneratingPodcast(false); + } + }; + + // Helper to finish the podcast generation process + const finishPodcastGeneration = () => { + toast.success("All podcasts are being generated! 
Check the podcasts tab to see them when ready."); + setPodcastDialogOpen(false); + setSelectedChats([]); + setSelectionMode(false); + setCurrentChatIndex(0); + setPodcastTitles({}); + setProcessingChat(null); + setPodcastTitle(""); + setIsGeneratingPodcast(false); + }; + + // Start podcast generation flow + const startPodcastGeneration = () => { + if (selectedChats.length === 0) { + toast.error("Please select at least one chat"); + return; + } + + // Reset the state for podcast generation + setCurrentChatIndex(0); + setPodcastTitles({}); + + // Set up for the first chat + const firstChatId = selectedChats[0]; + const firstChat = chats.find(chat => chat.id === firstChatId) || null; + setProcessingChat(firstChat); + + // Set default title for the first chat + setPodcastTitle(firstChat?.title || `Podcast from Chat ${firstChatId}`); + setPodcastDialogOpen(true); + }; + + // Update the title for the current chat + const updateCurrentChatTitle = (title: string) => { + const currentChatId = selectedChats[currentChatIndex]; + setPodcastTitle(title); + setPodcastTitles(prev => ({ + ...prev, + [currentChatId]: title + })); + }; + + // Skip generating a podcast for the current chat + const skipCurrentChat = () => { + if (currentChatIndex < selectedChats.length - 1) { + // Move to the next chat + setCurrentChatIndex(currentChatIndex + 1); + + // Find the next chat + const nextChatId = selectedChats[currentChatIndex + 1]; + const nextChat = chats.find(chat => chat.id === nextChatId) || null; + setProcessingChat(nextChat); + + // Set default title for the next chat + if (!podcastTitles[nextChatId]) { + setPodcastTitle(nextChat?.title || `Podcast from Chat ${nextChatId}`); + } else { + setPodcastTitle(podcastTitles[nextChatId]); + } + } else { + // All done (all skipped) + finishPodcastGeneration(); + } + }; + + // Toggle chat selection + const toggleChatSelection = (chatId: number) => { + setSelectedChats(prev => + prev.includes(chatId) + ? 
prev.filter(id => id !== chatId) + : [...prev, chatId] + ); + }; + + // Select all visible chats + const selectAllVisibleChats = () => { + const visibleChatIds = currentChats.map(chat => chat.id); + setSelectedChats(prev => { + const allSelected = visibleChatIds.every(id => prev.includes(id)); + return allSelected + ? prev.filter(id => !visibleChatIds.includes(id)) // Deselect all visible if all are selected + : [...new Set([...prev, ...visibleChatIds])]; // Add all visible, ensuring no duplicates + }); + }; + + // Cancel selection mode + const cancelSelectionMode = () => { + setSelectionMode(false); + setSelectedChats([]); + }; + return (
-
- +
+ {selectionMode ? ( + <> + + + + + ) : ( + <> + + + + )}
@@ -334,44 +564,69 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) animate="animate" exit="exit" transition={{ duration: 0.2, delay: index * 0.05 }} - className="overflow-hidden hover:shadow-md transition-shadow" + className={`overflow-hidden hover:shadow-md transition-shadow + ${selectionMode && selectedChats.includes(chat.id) + ? 'ring-2 ring-primary ring-offset-2' : ''}`} + onClick={() => selectionMode ? toggleChatSelection(chat.id) : null} >
-
- {chat.title || `Chat ${chat.id}`} - - - - {format(new Date(chat.created_at), 'MMM d, yyyy')} - - +
+ {selectionMode && ( +
+ {selectedChats.includes(chat.id) + ? + : } +
+ )} +
+ {chat.title || `Chat ${chat.id}`} + + + + {format(new Date(chat.created_at), 'MMM d, yyyy')} + + +
- - - - - - window.location.href = `/dashboard/${chat.search_space_id}/researcher/${chat.id}`}> - - View Chat - - - { - setChatToDelete({ id: chat.id, title: chat.title || `Chat ${chat.id}` }); - setDeleteDialogOpen(true); - }} - > - - Delete Chat - - - + {!selectionMode && ( + + + + + + window.location.href = `/dashboard/${chat.search_space_id}/researcher/${chat.id}`}> + + View Chat + + { + setSelectedChats([chat.id]); + setPodcastTitle(chat.title || `Chat ${chat.id}`); + setPodcastDialogOpen(true); + }} + > + + Generate Podcast + + + { + e.stopPropagation(); + setChatToDelete({ id: chat.id, title: chat.title || `Chat ${chat.id}` }); + setDeleteDialogOpen(true); + }} + > + + Delete Chat + + + + )}
@@ -505,6 +760,104 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) + + {/* Podcast Generation Dialog */} + { + if (!isOpen) { + // Cancel the process if dialog is closed + setPodcastDialogOpen(false); + setSelectedChats([]); + setSelectionMode(false); + setCurrentChatIndex(0); + setPodcastTitles({}); + setProcessingChat(null); + setPodcastTitle(""); + } else { + setPodcastDialogOpen(true); + } + }} + > + + + + + Generate Podcast {currentChatIndex + 1} of {selectedChats.length} + + + {selectedChats.length > 1 ? ( + <>Creating individual podcasts for each selected chat. Currently processing: {processingChat?.title || `Chat ${selectedChats[currentChatIndex]}`} + ) : ( + <>Create a podcast from this chat. The podcast will be available in the podcasts section once generated. + )} + + + +
+
+ + updateCurrentChatTitle(e.target.value)} + /> +
+ + {selectedChats.length > 1 && ( +
+
+
+ )} +
+ + + {selectedChats.length > 1 && !isGeneratingPodcast && ( + + )} + + + +
+
); } \ No newline at end of file diff --git a/surfsense_web/app/dashboard/[search_space_id]/layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/layout.tsx index 7449e10b5..a3c344aaf 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/layout.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/layout.tsx @@ -73,6 +73,13 @@ export default function DashboardLayout({ }, ], }, + { + title: "Podcasts", + url: `/dashboard/${search_space_id}/podcasts`, + icon: "Podcast", + items: [ + ], + } // TODO: Add research synthesizer's // { // title: "Research Synthesizer's", diff --git a/surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx new file mode 100644 index 000000000..394177c88 --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx @@ -0,0 +1,22 @@ +import { Suspense } from 'react'; +import PodcastsPageClient from './podcasts-client'; + +interface PageProps { + params: { + search_space_id: string; + }; +} + +export default async function PodcastsPage({ params }: PageProps) { + // Access dynamic route parameters + // Need to await params before accessing its properties in an async component + const { search_space_id: searchSpaceId } = await Promise.resolve(params); + + return ( + +
+
}> + + + ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx b/surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx new file mode 100644 index 000000000..cacee7061 --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx @@ -0,0 +1,787 @@ +'use client'; + +import { useState, useEffect, useRef } from 'react'; +import { motion, AnimatePresence } from 'framer-motion'; +import { format } from 'date-fns'; +import { + Search, Calendar, Trash2, MoreHorizontal, Podcast, + Play, Pause, SkipForward, SkipBack, Volume2, VolumeX +} from 'lucide-react'; + +// UI Components +import { Input } from '@/components/ui/input'; +import { Button } from '@/components/ui/button'; +import { Card, CardContent, CardFooter, CardHeader, CardTitle } from '@/components/ui/card'; +import { Slider } from '@/components/ui/slider'; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuTrigger, + DropdownMenuSeparator +} from '@/components/ui/dropdown-menu'; +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog"; +import { + Select, + SelectContent, + SelectGroup, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { toast } from "sonner"; + +interface Podcast { + id: number; + title: string; + created_at: string; + file_location: string; + podcast_transcript: any[]; + search_space_id: number; +} + +interface PodcastsPageClientProps { + searchSpaceId: string; +} + +const pageVariants = { + initial: { opacity: 0 }, + enter: { opacity: 1, transition: { duration: 0.3, ease: 'easeInOut' } }, + exit: { opacity: 0, transition: { duration: 0.3, ease: 'easeInOut' } } +}; + +const podcastCardVariants = { + initial: { y: 20, opacity: 0 }, + animate: { y: 0, opacity: 1 }, + exit: { y: -20, opacity: 0 } +}; + +const MotionCard = motion(Card); + +export default 
function PodcastsPageClient({ searchSpaceId }: PodcastsPageClientProps) { + const [podcasts, setPodcasts] = useState([]); + const [filteredPodcasts, setFilteredPodcasts] = useState([]); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + const [searchQuery, setSearchQuery] = useState(''); + const [sortOrder, setSortOrder] = useState('newest'); + const [deleteDialogOpen, setDeleteDialogOpen] = useState(false); + const [podcastToDelete, setPodcastToDelete] = useState<{ id: number, title: string } | null>(null); + const [isDeleting, setIsDeleting] = useState(false); + + // Audio player state + const [currentPodcast, setCurrentPodcast] = useState(null); + const [audioSrc, setAudioSrc] = useState(undefined); + const [isAudioLoading, setIsAudioLoading] = useState(false); + const [isPlaying, setIsPlaying] = useState(false); + const [currentTime, setCurrentTime] = useState(0); + const [duration, setDuration] = useState(0); + const [volume, setVolume] = useState(0.7); + const [isMuted, setIsMuted] = useState(false); + const audioRef = useRef(null); + const currentObjectUrlRef = useRef(null); + + // Add podcast image URL constant + const PODCAST_IMAGE_URL = "https://static.vecteezy.com/system/resources/thumbnails/002/157/611/small_2x/illustrations-concept-design-podcast-channel-free-vector.jpg"; + + // Fetch podcasts from API + useEffect(() => { + const fetchPodcasts = async () => { + try { + setIsLoading(true); + + // Get token from localStorage + const token = localStorage.getItem('surfsense_bearer_token'); + + if (!token) { + setError('Authentication token not found. 
Please log in again.'); + setIsLoading(false); + return; + } + + // Fetch all podcasts for this search space + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/`, + { + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json', + }, + cache: 'no-store', + } + ); + + if (!response.ok) { + const errorData = await response.json().catch(() => null); + throw new Error(`Failed to fetch podcasts: ${response.status} ${errorData?.detail || ''}`); + } + + const data: Podcast[] = await response.json(); + setPodcasts(data); + setFilteredPodcasts(data); + setError(null); + } catch (error) { + console.error('Error fetching podcasts:', error); + setError(error instanceof Error ? error.message : 'Unknown error occurred'); + setPodcasts([]); + setFilteredPodcasts([]); + } finally { + setIsLoading(false); + } + }; + + fetchPodcasts(); + }, [searchSpaceId]); + + // Filter and sort podcasts based on search query and sort order + useEffect(() => { + let result = [...podcasts]; + + // Filter by search term + if (searchQuery) { + const query = searchQuery.toLowerCase(); + result = result.filter(podcast => + podcast.title.toLowerCase().includes(query) + ); + } + + // Filter by search space + result = result.filter(podcast => + podcast.search_space_id === parseInt(searchSpaceId) + ); + + // Sort podcasts + result.sort((a, b) => { + const dateA = new Date(a.created_at).getTime(); + const dateB = new Date(b.created_at).getTime(); + + return sortOrder === 'newest' ? 
dateB - dateA : dateA - dateB; + }); + + setFilteredPodcasts(result); + }, [podcasts, searchQuery, sortOrder, searchSpaceId]); + + // Cleanup object URL on unmount or when currentPodcast changes + useEffect(() => { + return () => { + if (currentObjectUrlRef.current) { + URL.revokeObjectURL(currentObjectUrlRef.current); + currentObjectUrlRef.current = null; + } + }; + }, []); + + // Audio player time update handler + const handleTimeUpdate = () => { + if (audioRef.current) { + setCurrentTime(audioRef.current.currentTime); + } + }; + + // Audio player metadata loaded handler + const handleMetadataLoaded = () => { + if (audioRef.current) { + setDuration(audioRef.current.duration); + } + }; + + // Play/pause toggle + const togglePlayPause = () => { + if (audioRef.current) { + if (isPlaying) { + audioRef.current.pause(); + } else { + audioRef.current.play(); + } + setIsPlaying(!isPlaying); + } + }; + + // Seek to position + const handleSeek = (value: number[]) => { + if (audioRef.current) { + audioRef.current.currentTime = value[0]; + setCurrentTime(value[0]); + } + }; + + // Volume change + const handleVolumeChange = (value: number[]) => { + if (audioRef.current) { + const newVolume = value[0]; + audioRef.current.volume = newVolume; + setVolume(newVolume); + + if (newVolume === 0) { + setIsMuted(true); + } else if (isMuted) { + setIsMuted(false); + } + } + }; + + // Toggle mute + const toggleMute = () => { + if (audioRef.current) { + audioRef.current.muted = !isMuted; + setIsMuted(!isMuted); + } + }; + + // Skip forward 10 seconds + const skipForward = () => { + if (audioRef.current) { + audioRef.current.currentTime = Math.min(audioRef.current.duration, audioRef.current.currentTime + 10); + } + }; + + // Skip backward 10 seconds + const skipBackward = () => { + if (audioRef.current) { + audioRef.current.currentTime = Math.max(0, audioRef.current.currentTime - 10); + } + }; + + // Format time in MM:SS + const formatTime = (time: number) => { + const minutes = 
Math.floor(time / 60); + const seconds = Math.floor(time % 60); + return `${minutes}:${seconds < 10 ? '0' : ''}${seconds}`; + }; + + // Play podcast - Fetch blob and set object URL + const playPodcast = async (podcast: Podcast) => { + // If the same podcast is selected, just toggle play/pause + if (currentPodcast && currentPodcast.id === podcast.id) { + togglePlayPause(); + return; + } + + // Revoke previous object URL if exists + if (currentObjectUrlRef.current) { + URL.revokeObjectURL(currentObjectUrlRef.current); + currentObjectUrlRef.current = null; + } + + // Reset player state and show loading + setCurrentPodcast(podcast); + setAudioSrc(undefined); + setCurrentTime(0); + setDuration(0); + setIsPlaying(false); + setIsAudioLoading(true); + + try { + const token = localStorage.getItem('surfsense_bearer_token'); + if (!token) { + toast.error('Authentication token not found.'); + setIsAudioLoading(false); + return; + } + + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/${podcast.id}/stream`, + { + headers: { + 'Authorization': `Bearer ${token}`, + }, + } + ); + + if (!response.ok) { + throw new Error(`Failed to fetch audio stream: ${response.statusText}`); + } + + const blob = await response.blob(); + const objectUrl = URL.createObjectURL(blob); + currentObjectUrlRef.current = objectUrl; + setAudioSrc(objectUrl); + + // Let the audio element load the new src + setTimeout(() => { + if (audioRef.current) { + audioRef.current.load(); + audioRef.current.play() + .then(() => { + setIsPlaying(true); + }) + .catch(error => { + console.error('Error playing audio:', error); + toast.error('Failed to play audio.'); + setIsPlaying(false); + }); + } + }, 50); + + } catch (error) { + console.error('Error fetching or playing podcast:', error); + toast.error(error instanceof Error ? 
error.message : 'Failed to load podcast audio.'); + setCurrentPodcast(null); + } finally { + setIsAudioLoading(false); + } + }; + + // Function to handle podcast deletion + const handleDeletePodcast = async () => { + if (!podcastToDelete) return; + + setIsDeleting(true); + try { + const token = localStorage.getItem('surfsense_bearer_token'); + if (!token) { + setIsDeleting(false); + return; + } + + const response = await fetch(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/${podcastToDelete.id}`, { + method: 'DELETE', + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json', + } + }); + + if (!response.ok) { + throw new Error(`Failed to delete podcast: ${response.statusText}`); + } + + // Close dialog and refresh podcasts + setDeleteDialogOpen(false); + setPodcastToDelete(null); + + // Update local state by removing the deleted podcast + setPodcasts(prevPodcasts => prevPodcasts.filter(podcast => podcast.id !== podcastToDelete.id)); + + // If the current playing podcast is deleted, stop playback + if (currentPodcast && currentPodcast.id === podcastToDelete.id) { + if (audioRef.current) { + audioRef.current.pause(); + } + setCurrentPodcast(null); + setIsPlaying(false); + } + + toast.success('Podcast deleted successfully'); + } catch (error) { + console.error('Error deleting podcast:', error); + toast.error(error instanceof Error ? error.message : 'Failed to delete podcast'); + } finally { + setIsDeleting(false); + } + }; + + return ( + +
+
+

Podcasts

+

Listen to generated podcasts.

+
+ + {/* Filter and Search Bar */} +
+
+
+ + setSearchQuery(e.target.value)} + /> +
+
+ +
+ +
+
+ + {/* Status Messages */} + {isLoading && ( +
+
+
+

Loading podcasts...

+
+
+ )} + + {error && !isLoading && ( +
+

Error loading podcasts

+

{error}

+
+ )} + + {!isLoading && !error && filteredPodcasts.length === 0 && ( +
+ +

No podcasts found

+

+ {searchQuery + ? 'Try adjusting your search filters' + : 'Generate podcasts from your chats to get started'} +

+
+ )} + + {/* Podcast Grid */} + {!isLoading && !error && filteredPodcasts.length > 0 && ( + +
+ {filteredPodcasts.map((podcast, index) => ( + +
playPodcast(podcast)} + > + {/* Podcast image */} + Podcast illustration + + {/* Overlay for better contrast with controls */} +
+ + {/* Loading indicator */} + {currentPodcast?.id === podcast.id && isAudioLoading && ( +
+
+
+ )} + + {/* Play button */} + {!(currentPodcast?.id === podcast.id && (isPlaying || isAudioLoading)) && ( + + )} + + {/* Pause button */} + {currentPodcast?.id === podcast.id && isPlaying && !isAudioLoading && ( + + )} +
+ +
+

+ {podcast.title || 'Untitled Podcast'} +

+

+ + {format(new Date(podcast.created_at), 'MMM d, yyyy')} +

+
+ + {currentPodcast?.id === podcast.id && !isAudioLoading && ( +
+
{ + if (!audioRef.current || !duration) return; + const container = e.currentTarget; + const rect = container.getBoundingClientRect(); + const x = e.clientX - rect.left; + const percentage = Math.max(0, Math.min(1, x / rect.width)); + const newTime = percentage * duration; + handleSeek([newTime]); + }} + > +
+
+
+
+
+ {formatTime(currentTime)} + {formatTime(duration)} +
+
+ )} + + {currentPodcast?.id === podcast.id && !isAudioLoading && ( +
+ + + +
+ )} + +
+ + + + + + { + setPodcastToDelete({ id: podcast.id, title: podcast.title }); + setDeleteDialogOpen(true); + }} + > + + Delete Podcast + + + +
+ + + ))} +
+ + )} + + {/* Current Podcast Player (Fixed at bottom) */} + {currentPodcast && !isAudioLoading && audioSrc && ( + +
+
+
+
+ +
+
+ +
+

{currentPodcast.title}

+ +
+
+ +
+
+ {formatTime(currentTime)} / {formatTime(duration)} +
+
+
+ +
+ + + + + + +
+ + + +
+
+
+
+
+ )} +
+ + {/* Delete Confirmation Dialog */} + + + + + + Delete Podcast + + + Are you sure you want to delete {podcastToDelete?.title}? This action cannot be undone. + + + + + + + + + + {/* Hidden audio element for playback */} +