From 3675505eb1a9c4ca967aacfc80698999c9e37f3e Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 15:53:33 -0700 Subject: [PATCH 01/70] feat: Added LinkUP Search Engine Connector --- .../alembic/versions/4_add_linkup_api_enum.py | 45 ++++ .../app/agents/researcher/nodes.py | 70 ++++-- surfsense_backend/app/db.py | 3 +- .../app/schemas/search_source_connector.py | 10 + surfsense_backend/app/temp_test.py | 17 ++ .../app/utils/connector_service.py | 95 ++++++++ surfsense_backend/pyproject.toml | 1 + surfsense_backend/uv.lock | 15 ++ .../connectors/(manage)/page.tsx | 1 + .../connectors/[connector_id]/edit/page.tsx | 11 + .../connectors/[connector_id]/page.tsx | 16 +- .../connectors/add/linkup-api/page.tsx | 207 ++++++++++++++++++ .../[search_space_id]/connectors/add/page.tsx | 9 +- .../components/ModernHeroWithGradients.tsx | 2 +- .../components/chat/ConnectorComponents.tsx | 4 +- .../components/editConnector/types.ts | 1 + surfsense_web/hooks/useConnectorEditPage.ts | 11 +- surfsense_web/lib/connectors/utils.ts | 1 + 18 files changed, 492 insertions(+), 27 deletions(-) create mode 100644 surfsense_backend/alembic/versions/4_add_linkup_api_enum.py create mode 100644 surfsense_backend/app/temp_test.py create mode 100644 surfsense_web/app/dashboard/[search_space_id]/connectors/add/linkup-api/page.tsx diff --git a/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py new file mode 100644 index 000000000..8ccfac2d2 --- /dev/null +++ b/surfsense_backend/alembic/versions/4_add_linkup_api_enum.py @@ -0,0 +1,45 @@ +"""Add LINKUP_API to SearchSourceConnectorType enum + +Revision ID: 4 +Revises: 3 +Create Date: 2025-04-18 10:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '4' +down_revision: Union[str, None] = '3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + + # Manually add the command to add the enum value + op.execute("ALTER TYPE searchsourceconnectortype ADD VALUE 'LINKUP_API'") + + # Pass for the rest, as autogenerate didn't run to add other schema details + pass + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + + # Downgrading removal of an enum value requires recreating the type + op.execute("ALTER TYPE searchsourceconnectortype RENAME TO searchsourceconnectortype_old") + op.execute("CREATE TYPE searchsourceconnectortype AS ENUM('SERPER_API', 'TAVILY_API', 'SLACK_CONNECTOR', 'NOTION_CONNECTOR', 'GITHUB_CONNECTOR', 'LINEAR_CONNECTOR')") + op.execute(( + "ALTER TABLE search_source_connectors ALTER COLUMN connector_type TYPE searchsourceconnectortype USING " + "connector_type::text::searchsourceconnectortype" + )) + op.execute("DROP TYPE searchsourceconnectortype_old") + + pass + # ### end Alembic commands ### \ No newline at end of file diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index 15935f2ea..1b42d7155 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -143,7 +143,7 @@ async def fetch_relevant_documents( connectors_to_search: List[str], writer: StreamWriter = None, state: State = None, - top_k: int = 20 + top_k: int = 10 ) -> List[Dict[str, Any]]: """ Fetch relevant documents for research questions using the provided connectors. 
@@ -264,22 +264,6 @@ async def fetch_relevant_documents( streaming_service.only_update_terminal(f"Found {len(files_chunks)} file chunks relevant to the query") writer({"yeild_value": streaming_service._format_annotations()}) - elif connector == "TAVILY_API": - source_object, tavily_chunks = await connector_service.search_tavily( - user_query=reformulated_query, - user_id=user_id, - top_k=top_k - ) - - # Add to sources and raw documents - if source_object: - all_sources.append(source_object) - all_raw_documents.extend(tavily_chunks) - - # Stream found document count - if streaming_service and writer: - streaming_service.only_update_terminal(f"Found {len(tavily_chunks)} web search results relevant to the query") - writer({"yeild_value": streaming_service._format_annotations()}) elif connector == "SLACK_CONNECTOR": source_object, slack_chunks = await connector_service.search_slack( @@ -352,6 +336,47 @@ async def fetch_relevant_documents( if streaming_service and writer: streaming_service.only_update_terminal(f"Found {len(linear_chunks)} Linear issues relevant to the query") writer({"yeild_value": streaming_service._format_annotations()}) + + elif connector == "TAVILY_API": + source_object, tavily_chunks = await connector_service.search_tavily( + user_query=reformulated_query, + user_id=user_id, + top_k=top_k + ) + + # Add to sources and raw documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(tavily_chunks) + + # Stream found document count + if streaming_service and writer: + streaming_service.only_update_terminal(f"Found {len(tavily_chunks)} web search results relevant to the query") + writer({"yeild_value": streaming_service._format_annotations()}) + + elif connector == "LINKUP_API": + if top_k > 10: + linkup_mode = "deep" + else: + linkup_mode = "standard" + + source_object, linkup_chunks = await connector_service.search_linkup( + user_query=reformulated_query, + user_id=user_id, + mode=linkup_mode + ) + + # Add to sources 
and raw documents + if source_object: + all_sources.append(source_object) + all_raw_documents.extend(linkup_chunks) + + # Stream found document count + if streaming_service and writer: + streaming_service.only_update_terminal(f"Found {len(linkup_chunks)} Linkup chunks relevant to the query") + writer({"yeild_value": streaming_service._format_annotations()}) + + except Exception as e: error_message = f"Error searching connector {connector}: {str(e)}" print(error_message) @@ -462,6 +487,14 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW streaming_service.only_update_terminal("Searching for relevant information across all connectors...") writer({"yeild_value": streaming_service._format_annotations()}) + if configuration.num_sections == 1: + TOP_K = 10 + elif configuration.num_sections == 3: + TOP_K = 20 + elif configuration.num_sections == 6: + TOP_K = 30 + + relevant_documents = [] async with async_session_maker() as db_session: try: @@ -472,7 +505,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW db_session=db_session, connectors_to_search=configuration.connectors_to_search, writer=writer, - state=state + state=state, + top_k=TOP_K ) except Exception as e: error_message = f"Error fetching relevant documents: {str(e)}" diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 4426f4ffa..320f059dd 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -44,8 +44,9 @@ class DocumentType(str, Enum): LINEAR_CONNECTOR = "LINEAR_CONNECTOR" class SearchSourceConnectorType(str, Enum): - SERPER_API = "SERPER_API" + SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT TAVILY_API = "TAVILY_API" + LINKUP_API = "LINKUP_API" SLACK_CONNECTOR = "SLACK_CONNECTOR" NOTION_CONNECTOR = "NOTION_CONNECTOR" GITHUB_CONNECTOR = "GITHUB_CONNECTOR" diff --git 
a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index 6accc12af..cb7152e06 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -36,6 +36,16 @@ class SearchSourceConnectorBase(BaseModel): # Ensure the API key is not empty if not config.get("TAVILY_API_KEY"): raise ValueError("TAVILY_API_KEY cannot be empty") + + elif connector_type == SearchSourceConnectorType.LINKUP_API: + # For LINKUP_API, only allow LINKUP_API_KEY + allowed_keys = ["LINKUP_API_KEY"] + if set(config.keys()) != set(allowed_keys): + raise ValueError(f"For LINKUP_API connector type, config must only contain these keys: {allowed_keys}") + + # Ensure the API key is not empty + if not config.get("LINKUP_API_KEY"): + raise ValueError("LINKUP_API_KEY cannot be empty") elif connector_type == SearchSourceConnectorType.SLACK_CONNECTOR: # For SLACK_CONNECTOR, only allow SLACK_BOT_TOKEN diff --git a/surfsense_backend/app/temp_test.py b/surfsense_backend/app/temp_test.py new file mode 100644 index 000000000..f8ff10fec --- /dev/null +++ b/surfsense_backend/app/temp_test.py @@ -0,0 +1,17 @@ +from linkup import LinkupClient + +# Initialize the client (API key can be read from the environment variable or passed as an argument) +client = LinkupClient( + api_key="0ed1d08a-c8eb-4f01-9e3d-67cf87a3cd8f" +) + +# Perform a search query +search_response = client.search( + query="What is Surfsense?", + depth="standard", # "standard" or "deep" + output_type="searchResults", # "searchResults" or "sourcedAnswer" or "structured" + structured_output_schema=None, # must be filled if output_type is "structured" +) +print(search_response) + +# results=[LinkupSearchTextResult(type='text', name='SurfSense - Future Tools', url='https://www.futuretools.io/tools/surfsense', content='SurfSense is an open-source AI research assistant that functions as a personal, private alternative to 
tools like NotebookLM or Perplexity. It enables users to save webpages (even those behind login walls), upload documents, and build a searchable knowledge base that can be queried through natural language. The tool integrates with various external sources including search engines, Slack ...'), LinkupSearchTextResult(type='text', name='r/selfhosted on Reddit: SurfSense - Personal AI Assistant for World Wide Web Surfers.', url='https://www.reddit.com/r/selfhosted/comments/1fl58vh/surfsense_personal_ai_assistant_for_world_wide/', content='14 votes, 22 comments. Hi Everyone, For the past few months I have been trying to build a Personal AI Assistant for World Wide Web Surfers. It…\nWhat it is and why I am making it: Well when I’m browsing the internet, I tend to save a ton of content—but remembering when and what you saved? Total brain freeze! That’s where SurfSense comes in. SurfSense is a Personal AI Assistant for anything you see (Social Media Chats, Calendar Invites, Important Mails, Tutorials, Recipes and anything ) on the World Wide Web.\nPlease test it out at https://github.com/MODSetter/SurfSense and let me know your feedback.\nPosted by u/Uiqueblhats - 14 votes and 22 comments'), LinkupSearchTextResult(type='text', name='SurfSense - GitHub', url='https://github.com/DLMJR/surfsense', content='While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. 
It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Notion, YouTube, GitHub and more to come.'), LinkupSearchTextResult(type='text', name='How to Set Up and Use SurfSense: Your Personal AI Assistant', url='https://fxis.ai/edu/how-to-set-up-and-use-surfsense-your-personal-ai-assistant/', content='SurfSense is the answer to the common struggle of remembering what content you’ve saved while browsing the internet. Imagine your favorite library, but instead of books, it’s filled with every useful webpage, chat message, recipe, and tutorial you’ve come across. With SurfSense, you can instantly recall any of these digital treasures. Let’s embark on a journey to set up and utilize ...'), LinkupSearchTextResult(type='text', name='Surf Sense | F6S', url='https://www.f6s.com/surfsense', content='Surf Sense - Government - Surf Sense is the modern infrastructure network of the ocean.\nsurfsense.com.au · Nathan Adler · Sydney, Australia · Product leader, ex-engineer, start-up founder & maker, with end-to-end product development background in software and hardware. Product · Employee @Airtasker · Product · Employee @SafetyCulture · B Engineering / B Commerce @UNSW See 3 more ·'), LinkupSearchTextResult(type='text', name='Surf Sense | Online Surf Coaching & Knowledge Platform', url='https://www.surf-sense.com/', content='Join Surf Sense, the ultimate online surf coaching platform designed for intermediate and advanced surfers. Access expert-guided courses, weekly live Q&A sessions, and a thriving global surf community. 
Start improving your surfing today!\nundefined'), LinkupSearchTextResult(type='text', name='SurfSense - The Open Source Alternative to NotebookLM / Perplexity ...', url='https://www.redditmedia.com/r/selfhosted/comments/1jzi67a/surfsense_the_open_source_alternative_to/', content="For those of you who aren't familiar with SurfSense, it aims to be the open-source alternative to NotebookLM, Perplexity, or Glean. In short, it's a Highly Customizable AI Research Agent but connected to your personal external sources like search engines (Tavily), Slack, Notion, YouTube, GitHub, and more coming soon."), LinkupSearchTextResult(type='text', name='GitHub - MODSetter/SurfSense: Open Source Alternative to NotebookLM ...', url='https://github.com/MODSetter/SurfSense', content='While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more to come.'), LinkupSearchTextResult(type='text', name='GitHub - MODSetter/SurfSense: Open Source Alternative to NotebookLM / Perplexity / Glean, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more.', url='https://github.com/MODSetter/SurfSense', content='Open Source Alternative to NotebookLM / Perplexity / Glean, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more. 
- MODSetter/SurfSense\nWhile tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base.\nThe SurfSense extension can be used to save any webpage you like.\nThe SurfSense Podcast feature is currently being reworked for better UI and stability.\nSurfSense is actively being developed.'), LinkupSearchTextResult(type='text', name='SurfSense - Chrome Web Store', url='https://chromewebstore.google.com/detail/surfsense/jihmihbdpfjhppdlifphccgefjhifblf', content='Extension to collect Browsing History for SurfSense.\nWell when I’m browsing the internet, I tend to save a ton of content—but remembering when and what you saved? Total brain freeze! ❄️ That’s where SurfSense comes in. SurfSense is like a Knowledge Graph 🧠 Brain 🧠 for anything you see on the World Wide Web.\nSurfSense has disclosed the following information regarding the collection and usage of your data.\nThen, ask your personal knowledge base anything about your saved content., and voilà—instant recall! 
🧑\u200d💻🌐 Use this extension to capture & save your Web Content and chat with your personal Knowledge Graph 🧠 Brain 🧠 at https://www.surfsense.net')] \ No newline at end of file diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py index 9a6e13c43..7f88c1c0f 100644 --- a/surfsense_backend/app/utils/connector_service.py +++ b/surfsense_backend/app/utils/connector_service.py @@ -5,6 +5,7 @@ from sqlalchemy.future import select from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever from app.db import SearchSourceConnector, SearchSourceConnectorType from tavily import TavilyClient +from linkup import LinkupClient class ConnectorService: @@ -643,3 +644,97 @@ class ConnectorService: } return result_object, linear_chunks + + async def search_linkup(self, user_query: str, user_id: str, mode: str = "standard") -> tuple: + """ + Search using Linkup API and return both the source information and documents + + Args: + user_query: The user's query + user_id: The user's ID + mode: Search depth mode, can be "standard" or "deep" + + Returns: + tuple: (sources_info, documents) + """ + # Get Linkup connector configuration + linkup_connector = await self.get_connector_by_type(user_id, SearchSourceConnectorType.LINKUP_API) + + if not linkup_connector: + # Return empty results if no Linkup connector is configured + return { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": [], + }, [] + + # Initialize Linkup client with API key from connector config + linkup_api_key = linkup_connector.config.get("LINKUP_API_KEY") + linkup_client = LinkupClient(api_key=linkup_api_key) + + # Perform search with Linkup + try: + response = linkup_client.search( + query=user_query, + depth=mode, # Use the provided mode ("standard" or "deep") + output_type="searchResults", # Default to search results + ) + + # Extract results from Linkup response - access as attribute instead of using .get() + 
linkup_results = response.results if hasattr(response, 'results') else [] + + # Process each result and create sources directly without deduplication + sources_list = [] + documents = [] + + for i, result in enumerate(linkup_results): + # Fix for UI + linkup_results[i]['document']['id'] = self.source_id_counter + # Create a source entry + source = { + "id": self.source_id_counter, + "title": result.name if hasattr(result, 'name') else "Linkup Result", + "description": result.content[:100] if hasattr(result, 'content') else "", + "url": result.url if hasattr(result, 'url') else "" + } + sources_list.append(source) + + # Create a document entry + document = { + "chunk_id": f"linkup_chunk_{i}", + "content": result.content if hasattr(result, 'content') else "", + "score": 1.0, # Default score since not provided by Linkup + "document": { + "id": self.source_id_counter, + "title": result.name if hasattr(result, 'name') else "Linkup Result", + "document_type": "LINKUP_API", + "metadata": { + "url": result.url if hasattr(result, 'url') else "", + "type": result.type if hasattr(result, 'type') else "", + "source": "LINKUP_API" + } + } + } + documents.append(document) + self.source_id_counter += 1 + + # Create result object + result_object = { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": sources_list, + } + + return result_object, documents + + except Exception as e: + # Log the error and return empty results + print(f"Error searching with Linkup: {str(e)}") + return { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": [], + }, [] diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 7b7a6f900..8f8dc4c0e 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "langchain-community>=0.3.17", "langchain-unstructured>=0.1.6", "langgraph>=0.3.29", + "linkup-sdk>=0.2.4", "litellm>=1.61.4", "markdownify>=0.14.1", 
"notion-client>=2.3.0", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 9b485b0df..9601bccb3 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -1413,6 +1413,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8b/e4/5380e8229c442e406404977d2ec71a9db6a3e6a89fce7791c6ad7cd2bdbe/langsmith-0.3.8-py3-none-any.whl", hash = "sha256:fbb9dd97b0f090219447fca9362698d07abaeda1da85aa7cc6ec6517b36581b1", size = 332800 }, ] +[[package]] +name = "linkup-sdk" +version = "0.2.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/c7/d9a85331bf2611ecac67f1ad92a6ced641b2e2e93eea26b17a9af701b3d1/linkup_sdk-0.2.4.tar.gz", hash = "sha256:2b8fd1894b9b4715bc14aabcbf53df6def9024f2cc426f234cc59e1807ec4c12", size = 9392 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/d8/bb9e01328fe5ad979e3e459c0f76321d295663906deef56eeaa5ce0cf269/linkup_sdk-0.2.4-py3-none-any.whl", hash = "sha256:8bc4c4f34de93529136a14e42441d803868d681c2bf3fd59be51923e44f1f1d4", size = 8325 }, +] + [[package]] name = "litellm" version = "1.61.4" @@ -3078,6 +3091,7 @@ dependencies = [ { name = "langchain-community" }, { name = "langchain-unstructured" }, { name = "langgraph" }, + { name = "linkup-sdk" }, { name = "litellm" }, { name = "markdownify" }, { name = "notion-client" }, @@ -3106,6 +3120,7 @@ requires-dist = [ { name = "langchain-community", specifier = ">=0.3.17" }, { name = "langchain-unstructured", specifier = ">=0.1.6" }, { name = "langgraph", specifier = ">=0.3.29" }, + { name = "linkup-sdk", specifier = ">=0.2.4" }, { name = "litellm", specifier = ">=1.61.4" }, { name = "markdownify", specifier = ">=0.14.1" }, { name = "notion-client", specifier = ">=2.3.0" }, diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx 
b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx index 24fe6265d..af92a6ae5 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/(manage)/page.tsx @@ -46,6 +46,7 @@ const getConnectorTypeDisplay = (type: string): string => { "NOTION_CONNECTOR": "Notion", "GITHUB_CONNECTOR": "GitHub", "LINEAR_CONNECTOR": "Linear", + "LINKUP_API": "Linkup", // Add other connector types here as needed }; return typeMap[type] || type; diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx index d41295faa..5afea12c9 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/[connector_id]/edit/page.tsx @@ -160,6 +160,17 @@ export default function EditConnectorPage() { /> )} + {/* == Linkup == */} + {connector.connector_type === 'LINKUP_API' && ( + + )} + + + + + + Connect Linkup API + + Integrate with Linkup API to enhance your search capabilities with AI-powered search results. + + + + + + API Key Required + + You'll need a Linkup API key to use this connector. You can get one by signing up at{" "} + + linkup.so + + + + +
+ + ( + + Connector Name + + + + + A friendly name to identify this connector. + + + + )} + /> + + ( + + Linkup API Key + + + + + Your API key will be encrypted and stored securely. + + + + )} + /> + +
+ +
+ + +
+ +

What you get with Linkup API:

+
    +
  • AI-powered search results tailored to your queries
  • +
  • Real-time information from the web
  • +
  • Enhanced search capabilities for your projects
  • +
+
+
+
+ + ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx index 1f7490270..c04dae645 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/connectors/add/page.tsx @@ -16,6 +16,7 @@ import { IconWorldWww, IconTicket, IconLayoutKanban, + IconLinkPlus, } from "@tabler/icons-react"; import { AnimatePresence, motion } from "framer-motion"; import Link from "next/link"; @@ -50,7 +51,13 @@ const connectorCategories: ConnectorCategory[] = [ icon: , status: "available", }, - // Add other search engine connectors like Tavily, Serper if they have UI config + { + id: "linkup-api", + title: "Linkup API", + description: "Search the web using the Linkup API", + icon: , + status: "available", + }, ], }, { diff --git a/surfsense_web/components/ModernHeroWithGradients.tsx b/surfsense_web/components/ModernHeroWithGradients.tsx index 052c993da..b30c4bc82 100644 --- a/surfsense_web/components/ModernHeroWithGradients.tsx +++ b/surfsense_web/components/ModernHeroWithGradients.tsx @@ -36,7 +36,7 @@ export function ModernHeroWithGradients() {

- A Customizable AI Research Agent just like NotebookLM or Perplexity, but connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more. + A Customizable AI Research Agent just like NotebookLM or Perplexity, but connected to external sources such as search engines (Tavily, LinkUp), Slack, Linear, Notion, YouTube, GitHub and more.

{ const iconProps = { className: "h-4 w-4" }; switch(connectorType) { + case 'LINKUP_API': + return ; case 'LINEAR_CONNECTOR': return ; case 'GITHUB_CONNECTOR': diff --git a/surfsense_web/components/editConnector/types.ts b/surfsense_web/components/editConnector/types.ts index 364f23526..cc43e1c81 100644 --- a/surfsense_web/components/editConnector/types.ts +++ b/surfsense_web/components/editConnector/types.ts @@ -30,5 +30,6 @@ export const editConnectorSchema = z.object({ SERPER_API_KEY: z.string().optional(), TAVILY_API_KEY: z.string().optional(), LINEAR_API_KEY: z.string().optional(), + LINKUP_API_KEY: z.string().optional(), }); export type EditConnectorFormValues = z.infer; diff --git a/surfsense_web/hooks/useConnectorEditPage.ts b/surfsense_web/hooks/useConnectorEditPage.ts index d7672025d..7e81c5524 100644 --- a/surfsense_web/hooks/useConnectorEditPage.ts +++ b/surfsense_web/hooks/useConnectorEditPage.ts @@ -59,7 +59,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string) NOTION_INTEGRATION_TOKEN: config.NOTION_INTEGRATION_TOKEN || "", SERPER_API_KEY: config.SERPER_API_KEY || "", TAVILY_API_KEY: config.TAVILY_API_KEY || "", - LINEAR_API_KEY: config.LINEAR_API_KEY || "" + LINEAR_API_KEY: config.LINEAR_API_KEY || "", + LINKUP_API_KEY: config.LINKUP_API_KEY || "" }); if (currentConnector.connector_type === 'GITHUB_CONNECTOR') { const savedRepos = config.repo_full_names || []; @@ -164,6 +165,12 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string) newConfig = { LINEAR_API_KEY: formData.LINEAR_API_KEY }; } break; + case 'LINKUP_API': + if (formData.LINKUP_API_KEY !== originalConfig.LINKUP_API_KEY) { + if (!formData.LINKUP_API_KEY) { toast.error("Linkup API Key cannot be empty."); setIsSaving(false); return; } + newConfig = { LINKUP_API_KEY: formData.LINKUP_API_KEY }; + } + break; } if (newConfig !== null) { @@ -203,6 +210,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: 
string) editForm.setValue('TAVILY_API_KEY', newlySavedConfig.TAVILY_API_KEY || ""); } else if(connector.connector_type === 'LINEAR_CONNECTOR') { editForm.setValue('LINEAR_API_KEY', newlySavedConfig.LINEAR_API_KEY || ""); + } else if(connector.connector_type === 'LINKUP_API') { + editForm.setValue('LINKUP_API_KEY', newlySavedConfig.LINKUP_API_KEY || ""); } } if (connector.connector_type === 'GITHUB_CONNECTOR') { diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts index 5efc59386..f93bd3f82 100644 --- a/surfsense_web/lib/connectors/utils.ts +++ b/surfsense_web/lib/connectors/utils.ts @@ -7,6 +7,7 @@ export const getConnectorTypeDisplay = (type: string): string => { "NOTION_CONNECTOR": "Notion", "GITHUB_CONNECTOR": "GitHub", "LINEAR_CONNECTOR": "Linear", + "LINKUP_API": "Linkup", }; return typeMap[type] || type; }; From a945aceac77b402ac33a20f669fd1ad6a692229e Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 15:56:31 -0700 Subject: [PATCH 02/70] chore: readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e412fe2be..ad8633c47 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ # SurfSense -While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more to come. +While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily, LinkUp), Slack, Linear, Notion, YouTube, GitHub and more to come. 
# Video @@ -42,7 +42,7 @@ Open source and easy to deploy locally. - RAG as a Service API Backend. #### ℹ️ **External Sources** -- Search Engines (Tavily) +- Search Engines (Tavily, LinkUp) - Slack - Linear - Notion From cc4e02183ba8b98e3b50dcdcea46d4efe0eb54b6 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 15:59:48 -0700 Subject: [PATCH 03/70] oops disables --- surfsense_backend/app/temp_test.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 surfsense_backend/app/temp_test.py diff --git a/surfsense_backend/app/temp_test.py b/surfsense_backend/app/temp_test.py deleted file mode 100644 index f8ff10fec..000000000 --- a/surfsense_backend/app/temp_test.py +++ /dev/null @@ -1,17 +0,0 @@ -from linkup import LinkupClient - -# Initialize the client (API key can be read from the environment variable or passed as an argument) -client = LinkupClient( - api_key="0ed1d08a-c8eb-4f01-9e3d-67cf87a3cd8f" -) - -# Perform a search query -search_response = client.search( - query="What is Surfsense?", - depth="standard", # "standard" or "deep" - output_type="searchResults", # "searchResults" or "sourcedAnswer" or "structured" - structured_output_schema=None, # must be filled if output_type is "structured" -) -print(search_response) - -# results=[LinkupSearchTextResult(type='text', name='SurfSense - Future Tools', url='https://www.futuretools.io/tools/surfsense', content='SurfSense is an open-source AI research assistant that functions as a personal, private alternative to tools like NotebookLM or Perplexity. It enables users to save webpages (even those behind login walls), upload documents, and build a searchable knowledge base that can be queried through natural language. 
The tool integrates with various external sources including search engines, Slack ...'), LinkupSearchTextResult(type='text', name='r/selfhosted on Reddit: SurfSense - Personal AI Assistant for World Wide Web Surfers.', url='https://www.reddit.com/r/selfhosted/comments/1fl58vh/surfsense_personal_ai_assistant_for_world_wide/', content='14 votes, 22 comments. Hi Everyone, For the past few months I have been trying to build a Personal AI Assistant for World Wide Web Surfers. It…\nWhat it is and why I am making it: Well when I’m browsing the internet, I tend to save a ton of content—but remembering when and what you saved? Total brain freeze! That’s where SurfSense comes in. SurfSense is a Personal AI Assistant for anything you see (Social Media Chats, Calendar Invites, Important Mails, Tutorials, Recipes and anything ) on the World Wide Web.\nPlease test it out at https://github.com/MODSetter/SurfSense and let me know your feedback.\nPosted by u/Uiqueblhats - 14 votes and 22 comments'), LinkupSearchTextResult(type='text', name='SurfSense - GitHub', url='https://github.com/DLMJR/surfsense', content='While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Notion, YouTube, GitHub and more to come.'), LinkupSearchTextResult(type='text', name='How to Set Up and Use SurfSense: Your Personal AI Assistant', url='https://fxis.ai/edu/how-to-set-up-and-use-surfsense-your-personal-ai-assistant/', content='SurfSense is the answer to the common struggle of remembering what content you’ve saved while browsing the internet. Imagine your favorite library, but instead of books, it’s filled with every useful webpage, chat message, recipe, and tutorial you’ve come across. 
With SurfSense, you can instantly recall any of these digital treasures. Let’s embark on a journey to set up and utilize ...'), LinkupSearchTextResult(type='text', name='Surf Sense | F6S', url='https://www.f6s.com/surfsense', content='Surf Sense - Government - Surf Sense is the modern infrastructure network of the ocean.\nsurfsense.com.au · Nathan Adler · Sydney, Australia · Product leader, ex-engineer, start-up founder & maker, with end-to-end product development background in software and hardware. Product · Employee @Airtasker · Product · Employee @SafetyCulture · B Engineering / B Commerce @UNSW See 3 more ·'), LinkupSearchTextResult(type='text', name='Surf Sense | Online Surf Coaching & Knowledge Platform', url='https://www.surf-sense.com/', content='Join Surf Sense, the ultimate online surf coaching platform designed for intermediate and advanced surfers. Access expert-guided courses, weekly live Q&A sessions, and a thriving global surf community. Start improving your surfing today!\nundefined'), LinkupSearchTextResult(type='text', name='SurfSense - The Open Source Alternative to NotebookLM / Perplexity ...', url='https://www.redditmedia.com/r/selfhosted/comments/1jzi67a/surfsense_the_open_source_alternative_to/', content="For those of you who aren't familiar with SurfSense, it aims to be the open-source alternative to NotebookLM, Perplexity, or Glean. In short, it's a Highly Customizable AI Research Agent but connected to your personal external sources like search engines (Tavily), Slack, Notion, YouTube, GitHub, and more coming soon."), LinkupSearchTextResult(type='text', name='GitHub - MODSetter/SurfSense: Open Source Alternative to NotebookLM ...', url='https://github.com/MODSetter/SurfSense', content='While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. 
It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more to come.'), LinkupSearchTextResult(type='text', name='GitHub - MODSetter/SurfSense: Open Source Alternative to NotebookLM / Perplexity / Glean, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more.', url='https://github.com/MODSetter/SurfSense', content='Open Source Alternative to NotebookLM / Perplexity / Glean, connected to external sources such as search engines (Tavily), Slack, Linear, Notion, YouTube, GitHub and more. - MODSetter/SurfSense\nWhile tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base.\nThe SurfSense extension can be used to save any webpage you like.\nThe SurfSense Podcast feature is currently being reworked for better UI and stability.\nSurfSense is actively being developed.'), LinkupSearchTextResult(type='text', name='SurfSense - Chrome Web Store', url='https://chromewebstore.google.com/detail/surfsense/jihmihbdpfjhppdlifphccgefjhifblf', content='Extension to collect Browsing History for SurfSense.\nWell when I’m browsing the internet, I tend to save a ton of content—but remembering when and what you saved? Total brain freeze! ❄️ That’s where SurfSense comes in. SurfSense is like a Knowledge Graph 🧠 Brain 🧠 for anything you see on the World Wide Web.\nSurfSense has disclosed the following information regarding the collection and usage of your data.\nThen, ask your personal knowledge base anything about your saved content., and voilà—instant recall! 
🧑\u200d💻🌐 Use this extension to capture & save your Web Content and chat with your personal Knowledge Graph 🧠 Brain 🧠 at https://www.surfsense.net')] \ No newline at end of file From f956a39498ba5989cf327ab00eaf64ac4b7e73d1 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 16:17:20 -0700 Subject: [PATCH 04/70] chore(fix): linkup citation mapping --- surfsense_backend/app/utils/connector_service.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/surfsense_backend/app/utils/connector_service.py b/surfsense_backend/app/utils/connector_service.py index 7f88c1c0f..23e3035e8 100644 --- a/surfsense_backend/app/utils/connector_service.py +++ b/surfsense_backend/app/utils/connector_service.py @@ -684,13 +684,24 @@ class ConnectorService: # Extract results from Linkup response - access as attribute instead of using .get() linkup_results = response.results if hasattr(response, 'results') else [] + # Only proceed if we have results + if not linkup_results: + return { + "id": 10, + "name": "Linkup Search", + "type": "LINKUP_API", + "sources": [], + }, [] + # Process each result and create sources directly without deduplication sources_list = [] documents = [] for i, result in enumerate(linkup_results): - # Fix for UI - linkup_results[i]['document']['id'] = self.source_id_counter + # Only process results that have content + if not hasattr(result, 'content') or not result.content: + continue + # Create a source entry source = { "id": self.source_id_counter, From a971bb1f721889bc343bf18e9b414184efdb0f20 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sun, 27 Apr 2025 20:39:17 -0700 Subject: [PATCH 05/70] chore: update README and refactor ConnectorService for improved document handling and error management --- surfsense_backend/README.md | 1 - .../app/agents/researcher/nodes.py | 12 +- .../researcher/sub_section_writer/nodes.py | 2 +- .../app/utils/connector_service.py | 687 ++++++++++-------- 4 files 
changed, 399 insertions(+), 303 deletions(-) diff --git a/surfsense_backend/README.md b/surfsense_backend/README.md index 879fa4372..f78ec7df5 100644 --- a/surfsense_backend/README.md +++ b/surfsense_backend/README.md @@ -110,7 +110,6 @@ See pyproject.toml for detailed dependency information. Key dependencies include - fastapi and related packages - fastapi-users: Authentication and user management - firecrawl-py: Web crawling capabilities -- gpt-researcher: Advanced research capabilities - langchain components for AI workflows - litellm: LLM model integration - pgvector: Vector similarity search in PostgreSQL diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index 1b42d7155..4c3bc721f 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -143,7 +143,8 @@ async def fetch_relevant_documents( connectors_to_search: List[str], writer: StreamWriter = None, state: State = None, - top_k: int = 10 + top_k: int = 10, + connector_service: ConnectorService = None ) -> List[Dict[str, Any]]: """ Fetch relevant documents for research questions using the provided connectors. 
@@ -162,7 +163,7 @@ async def fetch_relevant_documents( List of relevant documents """ # Initialize services - connector_service = ConnectorService(db_session) + # connector_service = ConnectorService(db_session) # Only use streaming if both writer and state are provided streaming_service = state.streaming_service if state is not None else None @@ -494,10 +495,12 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW elif configuration.num_sections == 6: TOP_K = 30 - relevant_documents = [] async with async_session_maker() as db_session: try: + # Create connector service inside the db_session scope + connector_service = ConnectorService(db_session) + relevant_documents = await fetch_relevant_documents( research_questions=all_questions, user_id=configuration.user_id, @@ -506,7 +509,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW connectors_to_search=configuration.connectors_to_search, writer=writer, state=state, - top_k=TOP_K + top_k=TOP_K, + connector_service=connector_service ) except Exception as e: error_message = f"Error fetching relevant documents: {str(e)}" diff --git a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py index 0bec4618c..f1d50aeeb 100644 --- a/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py +++ b/surfsense_backend/app/agents/researcher/sub_section_writer/nodes.py @@ -102,7 +102,7 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A # Extract content and metadata content = doc.get("content", "") doc_info = doc.get("document", {}) - document_id = doc_info.get("id", f"{i+1}") # Use document ID or index+1 as source_id + document_id = doc_info.get("id") # Use document ID # Format document according to the citation system prompt's expected format formatted_doc = f""" diff --git a/surfsense_backend/app/utils/connector_service.py 
b/surfsense_backend/app/utils/connector_service.py index 23e3035e8..c7ad692e0 100644 --- a/surfsense_backend/app/utils/connector_service.py +++ b/surfsense_backend/app/utils/connector_service.py @@ -1,5 +1,6 @@ import json from typing import List, Dict, Any, Optional, Tuple +import asyncio from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from app.retriver.chunks_hybrid_search import ChucksHybridSearchRetriever @@ -13,6 +14,7 @@ class ConnectorService: self.session = session self.retriever = ChucksHybridSearchRetriever(session) self.source_id_counter = 1 + self.counter_lock = asyncio.Lock() # Lock to protect counter in multithreaded environments async def search_crawled_urls(self, user_query: str, user_id: str, search_space_id: int, top_k: int = 20) -> tuple: """ @@ -29,25 +31,35 @@ class ConnectorService: document_type="CRAWLED_URL" ) + # Early return if no results + if not crawled_urls_chunks: + return { + "id": 1, + "name": "Crawled URLs", + "type": "CRAWLED_URL", + "sources": [], + }, [] + # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(crawled_urls_chunks): - # Fix for UI - crawled_urls_chunks[i]['document']['id'] = self.source_id_counter - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(crawled_urls_chunks): + # Fix for UI + crawled_urls_chunks[i]['document']['id'] = self.source_id_counter + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a source entry - source = { - "id": self.source_id_counter, - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') - } + # Create a source entry + source = { + "id": self.source_id_counter, 
+ "title": document.get('title', 'Untitled Document'), + "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), + "url": metadata.get('url', '') + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -73,26 +85,36 @@ class ConnectorService: search_space_id=search_space_id, document_type="FILE" ) + + # Early return if no results + if not files_chunks: + return { + "id": 2, + "name": "Files", + "type": "FILE", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(files_chunks): - # Fix for UI - files_chunks[i]['document']['id'] = self.source_id_counter - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(files_chunks): + # Fix for UI + files_chunks[i]['document']['id'] = self.source_id_counter + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a source entry - source = { - "id": self.source_id_counter, - "title": document.get('title', 'Untitled Document'), - "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), - "url": metadata.get('url', '') - } + # Create a source entry + source = { + "id": self.source_id_counter, + "title": document.get('title', 'Untitled Document'), + "description": metadata.get('og:description', metadata.get('ogDescription', chunk.get('content', '')[:100])), + "url": metadata.get('url', '') + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -163,39 +185,49 @@ class ConnectorService: # Extract results from Tavily response tavily_results = 
response.get("results", []) + # Early return if no results + if not tavily_results: + return { + "id": 3, + "name": "Tavily Search", + "type": "TAVILY_API", + "sources": [], + }, [] + # Process each result and create sources directly without deduplication sources_list = [] documents = [] - for i, result in enumerate(tavily_results): - - # Create a source entry - source = { - "id": self.source_id_counter, - "title": result.get("title", "Tavily Result"), - "description": result.get("content", "")[:100], - "url": result.get("url", "") - } - sources_list.append(source) - - # Create a document entry - document = { - "chunk_id": f"tavily_chunk_{i}", - "content": result.get("content", ""), - "score": result.get("score", 0.0), - "document": { + async with self.counter_lock: + for i, result in enumerate(tavily_results): + + # Create a source entry + source = { "id": self.source_id_counter, "title": result.get("title", "Tavily Result"), - "document_type": "TAVILY_API", - "metadata": { - "url": result.get("url", ""), - "published_date": result.get("published_date", ""), - "source": "TAVILY_API" + "description": result.get("content", "")[:100], + "url": result.get("url", "") + } + sources_list.append(source) + + # Create a document entry + document = { + "chunk_id": f"tavily_chunk_{i}", + "content": result.get("content", ""), + "score": result.get("score", 0.0), + "document": { + "id": self.source_id_counter, + "title": result.get("title", "Tavily Result"), + "document_type": "TAVILY_API", + "metadata": { + "url": result.get("url", ""), + "published_date": result.get("published_date", ""), + "source": "TAVILY_API" + } } } - } - documents.append(document) - self.source_id_counter += 1 + documents.append(document) + self.source_id_counter += 1 # Create result object result_object = { @@ -231,45 +263,55 @@ class ConnectorService: search_space_id=search_space_id, document_type="SLACK_CONNECTOR" ) + + # Early return if no results + if not slack_chunks: + return { + "id": 4, + 
"name": "Slack", + "type": "SLACK_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(slack_chunks): - # Fix for UI - slack_chunks[i]['document']['id'] = self.source_id_counter - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(slack_chunks): + # Fix for UI + slack_chunks[i]['document']['id'] = self.source_id_counter + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a mapped source entry with Slack-specific metadata - channel_name = metadata.get('channel_name', 'Unknown Channel') - channel_id = metadata.get('channel_id', '') - message_date = metadata.get('start_date', '') - - # Create a more descriptive title for Slack messages - title = f"Slack: {channel_name}" - if message_date: - title += f" ({message_date})" + # Create a mapped source entry with Slack-specific metadata + channel_name = metadata.get('channel_name', 'Unknown Channel') + channel_id = metadata.get('channel_id', '') + message_date = metadata.get('start_date', '') - # Create a more descriptive description for Slack messages - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." - - # For URL, we can use a placeholder or construct a URL to the Slack channel if available - url = "" - if channel_id: - url = f"https://slack.com/app_redirect?channel={channel_id}" + # Create a more descriptive title for Slack messages + title = f"Slack: {channel_name}" + if message_date: + title += f" ({message_date})" + + # Create a more descriptive description for Slack messages + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." 
+ + # For URL, we can use a placeholder or construct a URL to the Slack channel if available + url = "" + if channel_id: + url = f"https://slack.com/app_redirect?channel={channel_id}" - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -301,47 +343,57 @@ class ConnectorService: search_space_id=search_space_id, document_type="NOTION_CONNECTOR" ) + + # Early return if no results + if not notion_chunks: + return { + "id": 5, + "name": "Notion", + "type": "NOTION_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(notion_chunks): - # Fix for UI - notion_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) - - # Create a mapped source entry with Notion-specific metadata - page_title = metadata.get('page_title', 'Untitled Page') - page_id = metadata.get('page_id', '') - indexed_at = metadata.get('indexed_at', '') - - # Create a more descriptive title for Notion pages - title = f"Notion: {page_title}" - if indexed_at: - title += f" (indexed: {indexed_at})" + async with self.counter_lock: + for i, chunk in enumerate(notion_chunks): + # Fix for UI + notion_chunks[i]['document']['id'] = self.source_id_counter - # Create a more descriptive description for Notion pages - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." 
+ # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) + + # Create a mapped source entry with Notion-specific metadata + page_title = metadata.get('page_title', 'Untitled Page') + page_id = metadata.get('page_id', '') + indexed_at = metadata.get('indexed_at', '') - # For URL, we can use a placeholder or construct a URL to the Notion page if available - url = "" - if page_id: - # Notion page URLs follow this format - url = f"https://notion.so/{page_id.replace('-', '')}" + # Create a more descriptive title for Notion pages + title = f"Notion: {page_title}" + if indexed_at: + title += f" (indexed: {indexed_at})" + + # Create a more descriptive description for Notion pages + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." + + # For URL, we can use a placeholder or construct a URL to the Notion page if available + url = "" + if page_id: + # Notion page URLs follow this format + url = f"https://notion.so/{page_id.replace('-', '')}" - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -373,65 +425,75 @@ class ConnectorService: search_space_id=search_space_id, document_type="EXTENSION" ) + + # Early return if no results + if not extension_chunks: + return { + "id": 6, + "name": "Extension", + "type": "EXTENSION", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(extension_chunks): - # Fix for UI - extension_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = 
document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(extension_chunks): + # Fix for UI + extension_chunks[i]['document']['id'] = self.source_id_counter + + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Extract extension-specific metadata - webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page') - webpage_url = metadata.get('VisitedWebPageURL', '') - visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') - visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '') - browsing_session_id = metadata.get('BrowsingSessionId', '') - - # Create a more descriptive title for extension data - title = webpage_title - if visit_date: - # Format the date for display (simplified) - try: - # Just extract the date part for display - formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date - title += f" (visited: {formatted_date})" - except: - # Fallback if date parsing fails - title += f" (visited: {visit_date})" + # Extract extension-specific metadata + webpage_title = metadata.get('VisitedWebPageTitle', 'Untitled Page') + webpage_url = metadata.get('VisitedWebPageURL', '') + visit_date = metadata.get('VisitedWebPageDateWithTimeInISOString', '') + visit_duration = metadata.get('VisitedWebPageVisitDurationInMilliseconds', '') + browsing_session_id = metadata.get('BrowsingSessionId', '') - # Create a more descriptive description for extension data - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." 
- - # Add visit duration if available - if visit_duration: - try: - duration_seconds = int(visit_duration) / 1000 - if duration_seconds < 60: - duration_text = f"{duration_seconds:.1f} seconds" - else: - duration_text = f"{duration_seconds/60:.1f} minutes" + # Create a more descriptive title for extension data + title = webpage_title + if visit_date: + # Format the date for display (simplified) + try: + # Just extract the date part for display + formatted_date = visit_date.split('T')[0] if 'T' in visit_date else visit_date + title += f" (visited: {formatted_date})" + except: + # Fallback if date parsing fails + title += f" (visited: {visit_date})" - if description: - description += f" | Duration: {duration_text}" - except: - # Fallback if duration parsing fails - pass + # Create a more descriptive description for extension data + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." + + # Add visit duration if available + if visit_duration: + try: + duration_seconds = int(visit_duration) / 1000 + if duration_seconds < 60: + duration_text = f"{duration_seconds:.1f} seconds" + else: + duration_text = f"{duration_seconds/60:.1f} minutes" + + if description: + description += f" | Duration: {duration_text}" + except: + # Fallback if duration parsing fails + pass - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": webpage_url - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": webpage_url + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -463,47 +525,57 @@ class ConnectorService: search_space_id=search_space_id, document_type="YOUTUBE_VIDEO" ) + + # Early return if no results + if not youtube_chunks: + return { + "id": 7, + "name": "YouTube Videos", + "type": "YOUTUBE_VIDEO", + "sources": [], + }, [] # Process 
each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(youtube_chunks): - # Fix for UI - youtube_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) - - # Extract YouTube-specific metadata - video_title = metadata.get('video_title', 'Untitled Video') - video_id = metadata.get('video_id', '') - channel_name = metadata.get('channel_name', '') - published_date = metadata.get('published_date', '') - - # Create a more descriptive title for YouTube videos - title = video_title - if channel_name: - title += f" - {channel_name}" + async with self.counter_lock: + for i, chunk in enumerate(youtube_chunks): + # Fix for UI + youtube_chunks[i]['document']['id'] = self.source_id_counter - # Create a more descriptive description for YouTube videos - description = metadata.get('description', chunk.get('content', '')[:100]) - if len(description) == 100: - description += "..." + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) + + # Extract YouTube-specific metadata + video_title = metadata.get('video_title', 'Untitled Video') + video_id = metadata.get('video_id', '') + channel_name = metadata.get('channel_name', '') + published_date = metadata.get('published_date', '') - # For URL, construct a URL to the YouTube video - url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" + # Create a more descriptive title for YouTube videos + title = video_title + if channel_name: + title += f" - {channel_name}" + + # Create a more descriptive description for YouTube videos + description = metadata.get('description', chunk.get('content', '')[:100]) + if len(description) == 100: + description += "..." 
+ + # For URL, construct a URL to the YouTube video + url = f"https://www.youtube.com/watch?v={video_id}" if video_id else "" - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - "video_id": video_id, # Additional field for YouTube videos - "channel_name": channel_name # Additional field for YouTube videos - } + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + "video_id": video_id, # Additional field for YouTube videos + "channel_name": channel_name # Additional field for YouTube videos + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -529,27 +601,37 @@ class ConnectorService: search_space_id=search_space_id, document_type="GITHUB_CONNECTOR" ) + + # Early return if no results + if not github_chunks: + return { + "id": 8, + "name": "GitHub", + "type": "GITHUB_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(github_chunks): - # Fix for UI - assign a unique ID for citation/source tracking - github_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) + async with self.counter_lock: + for i, chunk in enumerate(github_chunks): + # Fix for UI - assign a unique ID for citation/source tracking + github_chunks[i]['document']['id'] = self.source_id_counter + + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - # Create a source entry - source = { - "id": self.source_id_counter, - "title": document.get('title', 'GitHub Document'), # Use specific title if available - "description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content 
preview - "url": metadata.get('url', '') # Use URL if available in metadata - } + # Create a source entry + source = { + "id": self.source_id_counter, + "title": document.get('title', 'GitHub Document'), # Use specific title if available + "description": metadata.get('description', chunk.get('content', '')[:100]), # Use description or content preview + "url": metadata.get('url', '') # Use URL if available in metadata + } - self.source_id_counter += 1 - sources_list.append(source) + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -581,59 +663,69 @@ class ConnectorService: search_space_id=search_space_id, document_type="LINEAR_CONNECTOR" ) + + # Early return if no results + if not linear_chunks: + return { + "id": 9, + "name": "Linear Issues", + "type": "LINEAR_CONNECTOR", + "sources": [], + }, [] # Process each chunk and create sources directly without deduplication sources_list = [] - for i, chunk in enumerate(linear_chunks): - # Fix for UI - linear_chunks[i]['document']['id'] = self.source_id_counter - - # Extract document metadata - document = chunk.get('document', {}) - metadata = document.get('metadata', {}) - - # Extract Linear-specific metadata - issue_identifier = metadata.get('issue_identifier', '') - issue_title = metadata.get('issue_title', 'Untitled Issue') - issue_state = metadata.get('state', '') - comment_count = metadata.get('comment_count', 0) - - # Create a more descriptive title for Linear issues - title = f"Linear: {issue_identifier} - {issue_title}" - if issue_state: - title += f" ({issue_state})" + async with self.counter_lock: + for i, chunk in enumerate(linear_chunks): + # Fix for UI + linear_chunks[i]['document']['id'] = self.source_id_counter - # Create a more descriptive description for Linear issues - description = chunk.get('content', '')[:100] - if len(description) == 100: - description += "..." 
- - # Add comment count info to description - if comment_count: - if description: - description += f" | Comments: {comment_count}" - else: - description = f"Comments: {comment_count}" - - # For URL, we could construct a URL to the Linear issue if we have the workspace info - # For now, use a generic placeholder - url = "" - if issue_identifier: - # This is a generic format, may need to be adjusted based on actual Linear workspace - url = f"https://linear.app/issue/{issue_identifier}" + # Extract document metadata + document = chunk.get('document', {}) + metadata = document.get('metadata', {}) - source = { - "id": self.source_id_counter, - "title": title, - "description": description, - "url": url, - "issue_identifier": issue_identifier, - "state": issue_state, - "comment_count": comment_count - } + # Extract Linear-specific metadata + issue_identifier = metadata.get('issue_identifier', '') + issue_title = metadata.get('issue_title', 'Untitled Issue') + issue_state = metadata.get('state', '') + comment_count = metadata.get('comment_count', 0) + + # Create a more descriptive title for Linear issues + title = f"Linear: {issue_identifier} - {issue_title}" + if issue_state: + title += f" ({issue_state})" + + # Create a more descriptive description for Linear issues + description = chunk.get('content', '')[:100] + if len(description) == 100: + description += "..." 
+ + # Add comment count info to description + if comment_count: + if description: + description += f" | Comments: {comment_count}" + else: + description = f"Comments: {comment_count}" + + # For URL, we could construct a URL to the Linear issue if we have the workspace info + # For now, use a generic placeholder + url = "" + if issue_identifier: + # This is a generic format, may need to be adjusted based on actual Linear workspace + url = f"https://linear.app/issue/{issue_identifier}" - self.source_id_counter += 1 - sources_list.append(source) + source = { + "id": self.source_id_counter, + "title": title, + "description": description, + "url": url, + "issue_identifier": issue_identifier, + "state": issue_state, + "comment_count": comment_count + } + + self.source_id_counter += 1 + sources_list.append(source) # Create result object result_object = { @@ -697,38 +789,39 @@ class ConnectorService: sources_list = [] documents = [] - for i, result in enumerate(linkup_results): - # Only process results that have content - if not hasattr(result, 'content') or not result.content: - continue - - # Create a source entry - source = { - "id": self.source_id_counter, - "title": result.name if hasattr(result, 'name') else "Linkup Result", - "description": result.content[:100] if hasattr(result, 'content') else "", - "url": result.url if hasattr(result, 'url') else "" - } - sources_list.append(source) - - # Create a document entry - document = { - "chunk_id": f"linkup_chunk_{i}", - "content": result.content if hasattr(result, 'content') else "", - "score": 1.0, # Default score since not provided by Linkup - "document": { + async with self.counter_lock: + for i, result in enumerate(linkup_results): + # Only process results that have content + if not hasattr(result, 'content') or not result.content: + continue + + # Create a source entry + source = { "id": self.source_id_counter, "title": result.name if hasattr(result, 'name') else "Linkup Result", - "document_type": "LINKUP_API", - 
"metadata": { - "url": result.url if hasattr(result, 'url') else "", - "type": result.type if hasattr(result, 'type') else "", - "source": "LINKUP_API" + "description": result.content[:100] if hasattr(result, 'content') else "", + "url": result.url if hasattr(result, 'url') else "" + } + sources_list.append(source) + + # Create a document entry + document = { + "chunk_id": f"linkup_chunk_{i}", + "content": result.content if hasattr(result, 'content') else "", + "score": 1.0, # Default score since not provided by Linkup + "document": { + "id": self.source_id_counter, + "title": result.name if hasattr(result, 'name') else "Linkup Result", + "document_type": "LINKUP_API", + "metadata": { + "url": result.url if hasattr(result, 'url') else "", + "type": result.type if hasattr(result, 'type') else "", + "source": "LINKUP_API" + } } } - } - documents.append(document) - self.source_id_counter += 1 + documents.append(document) + self.source_id_counter += 1 # Create result object result_object = { From 22da221ad533e7cf257b2c7171704d73cafb5592 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Tue, 29 Apr 2025 23:02:07 -0700 Subject: [PATCH 06/70] feat: Shifted to RecursiveChunker and CodeChunker - Codebase Q/A should be lot better now. 
--- surfsense_backend/app/config/__init__.py | 14 +- .../app/tasks/background_tasks.py | 40 +-- .../app/tasks/connectors_indexing_tasks.py | 10 +- surfsense_backend/pyproject.toml | 2 +- surfsense_backend/uv.lock | 288 +++++++++++++++--- 5 files changed, 285 insertions(+), 69 deletions(-) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index c7f842b71..4adf2b7dc 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -1,12 +1,10 @@ import os from pathlib import Path -from chonkie import AutoEmbeddings, LateChunker -from rerankers import Reranker -from langchain_community.chat_models import ChatLiteLLM - - +from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker from dotenv import load_dotenv +from langchain_community.chat_models import ChatLiteLLM +from rerankers import Reranker # Get the base directory of the project BASE_DIR = Path(__file__).resolve().parent.parent.parent @@ -39,10 +37,12 @@ class Config: # Chonkie Configuration | Edit this to your needs EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL") embedding_model_instance = AutoEmbeddings.get_embeddings(EMBEDDING_MODEL) - chunker_instance = LateChunker( - embedding_model=EMBEDDING_MODEL, + chunker_instance = RecursiveChunker( chunk_size=embedding_model_instance.max_seq_length, ) + code_chunker_instance = CodeChunker( + chunk_size=embedding_model_instance.max_seq_length + ) # Reranker's Configuration | Pinecode, Cohere etc. 
Read more at https://github.com/AnswerDotAI/rerankers?tab=readme-ov-file#usage RERANKERS_MODEL_NAME = os.getenv("RERANKERS_MODEL_NAME") diff --git a/surfsense_backend/app/tasks/background_tasks.py b/surfsense_backend/app/tasks/background_tasks.py index b2f6f8c81..68b56c435 100644 --- a/surfsense_backend/app/tasks/background_tasks.py +++ b/surfsense_backend/app/tasks/background_tasks.py @@ -80,7 +80,7 @@ async def add_crawled_url_document( # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(content_in_markdown) ] @@ -166,7 +166,7 @@ async def add_extension_received_document( # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(content.pageContent) ] @@ -215,7 +215,7 @@ async def add_received_file_document( # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(file_in_markdown) ] @@ -256,14 +256,14 @@ async def add_youtube_video_document( """ try: from youtube_transcript_api import YouTubeTranscriptApi - + # Extract video ID from URL def get_youtube_video_id(url: str): from urllib.parse import urlparse, parse_qs - + parsed_url = urlparse(url) hostname = parsed_url.hostname - + if hostname == "youtu.be": return parsed_url.path[1:] if hostname in ("www.youtube.com", "youtube.com"): @@ -275,26 +275,27 @@ async def add_youtube_video_document( if parsed_url.path.startswith("/v/"): return parsed_url.path.split("/")[2] return None - + # Get video ID video_id = get_youtube_video_id(url) if not video_id: raise ValueError(f"Could not extract video ID from URL: {url}") - + # Get video metadata import 
json from urllib.parse import urlencode from urllib.request import urlopen - - params = {"format": "json", "url": f"https://www.youtube.com/watch?v={video_id}"} + + params = {"format": "json", + "url": f"https://www.youtube.com/watch?v={video_id}"} oembed_url = "https://www.youtube.com/oembed" query_string = urlencode(params) full_url = oembed_url + "?" + query_string - + with urlopen(full_url) as response: response_text = response.read() video_data = json.loads(response_text.decode()) - + # Get video transcript try: captions = YouTubeTranscriptApi.get_transcript(video_id) @@ -309,7 +310,7 @@ async def add_youtube_video_document( transcript_text = "\n".join(transcript_segments) except Exception as e: transcript_text = f"No captions available for this video. Error: {str(e)}" - + # Format document metadata in a more maintainable way metadata_sections = [ ("METADATA", [ @@ -343,17 +344,18 @@ async def add_youtube_video_document( summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance summary_result = await summary_chain.ainvoke({"document": combined_document_string}) summary_content = summary_result.content - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_embedding = config.embedding_model_instance.embed( + summary_content) # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(transcript_text) ] - + # Create document from app.db import Document, DocumentType - + document = Document( title=video_data.get("title", "YouTube Video"), document_type=DocumentType.YOUTUBE_VIDEO, @@ -369,11 +371,11 @@ async def add_youtube_video_document( chunks=chunks, search_space_id=search_space_id ) - + session.add(document) await session.commit() await session.refresh(document) - + return document except SQLAlchemyError as db_error: await session.rollback() diff --git 
a/surfsense_backend/app/tasks/connectors_indexing_tasks.py b/surfsense_backend/app/tasks/connectors_indexing_tasks.py index 7c210628d..94643a45d 100644 --- a/surfsense_backend/app/tasks/connectors_indexing_tasks.py +++ b/surfsense_backend/app/tasks/connectors_indexing_tasks.py @@ -222,7 +222,7 @@ async def index_slack_messages( # Process chunks chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(channel_content) ] @@ -515,7 +515,7 @@ async def index_notion_pages( # Process chunks logger.debug(f"Chunking content for page {page_title}") chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(markdown_content) ] @@ -720,8 +720,8 @@ async def index_github_repos( # Chunk the content try: chunks_data = [ - Chunk(content=chunk.text, embedding=chunk.embedding) - for chunk in config.chunker_instance.chunk(file_content) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + for chunk in config.code_chunker_instance.chunk(file_content) ] except Exception as chunk_err: logger.error(f"Failed to chunk file {full_path_key}: {chunk_err}") @@ -984,7 +984,7 @@ async def index_linear_issues( # Process chunks - using the full issue content with comments chunks = [ - Chunk(content=chunk.text, embedding=chunk.embedding) + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) for chunk in config.chunker_instance.chunk(issue_content) ] diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 8f8dc4c0e..c447a74b8 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -7,7 +7,7 @@ requires-python = ">=3.12" dependencies = [ "alembic>=1.13.0", "asyncpg>=0.30.0", - 
"chonkie[all]>=0.4.1", + "chonkie[all]>=1.0.6", "fastapi>=0.115.8", "fastapi-users[oauth,sqlalchemy]>=14.0.1", "firecrawl-py>=1.12.0", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 9601bccb3..a5621abda 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -13,6 +13,24 @@ resolution-markers = [ "(python_full_version < '3.12.4' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12.4' and sys_platform != 'darwin' and sys_platform != 'linux')", ] +[[package]] +name = "accelerate" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/6e/c29a1dcde7db07f47870ed63e5124086b11874ad52ccd533dc1ca2c799da/accelerate-1.6.0.tar.gz", hash = "sha256:28c1ef1846e690944f98b68dc7b8bb6c51d032d45e85dcbb3adb0c8b99dffb32", size = 363804 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/b1/8198e3cdd11a426b1df2912e3381018c4a4a55368f6d0857ba3ca418ef93/accelerate-1.6.0-py3-none-any.whl", hash = "sha256:1aee717d3d3735ad6d09710a7c26990ee4652b79b4e93df46551551b5227c2aa", size = 354748 }, +] + [[package]] name = "aiofiles" version = "24.1.0" @@ -201,19 +219,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fc/30/d4986a882011f9df997a55e6becd864812ccfcd821d64aac8570ee39f719/attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a", size = 63152 }, ] -[[package]] -name = "autotiktokenizer" -version = "0.2.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "huggingface-hub" }, - { name = "tiktoken" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/a6/1a/c6f494750dc67c2e5b06b91ae9565d46adb384f25f61a7136ff79dd02413/autotiktokenizer-0.2.2.tar.gz", hash = "sha256:f0954f14cedfe538b96ba0eed2e39996378c0bdf649fd977d6a047e419e05fdb", size = 15401 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/7b/c34469a1495d755bac1c80fbf3c0c2c29eb03ffe61172d889426025173bd/autotiktokenizer-0.2.2-py3-none-any.whl", hash = "sha256:ebbf15d9d5516fcb3287a8153bd8efbcc932f9c99089b2357255413cf37815d9", size = 8957 }, -] - [[package]] name = "backoff" version = "2.2.1" @@ -363,23 +368,36 @@ wheels = [ [[package]] name = "chonkie" -version = "0.4.1" +version = "1.0.6" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "autotiktokenizer" }, + { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2e/94/4a1bc8bdf06e7327bb256abb85767647125286c9bbc7cbcd77a550b96d63/chonkie-0.4.1.tar.gz", hash = "sha256:164216efa01af02e750e7cb218cea87918a18f83ebbd8f020b25557f1ed36aa9", size = 43284 } +sdist = { url = "https://files.pythonhosted.org/packages/5a/db/16d5d23a216db734bcb68e61c466ff48a55dc0d2cdc7ecdd73aaea1f6f7d/chonkie-1.0.6.tar.gz", hash = "sha256:feefad3cbbb62b4a55f4c6409bd8d8f0ee180d8319c4d32e31539a768955b3b0", size = 70056 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/b5/c0d77500a413794773edb630bdc7061121c237a4eaf6ce222226c200d603/chonkie-0.4.1-py3-none-any.whl", hash = "sha256:af7d95d17f4ed60a26e32f0bad60f807287e3301189114755d727657ed2ef964", size = 51193 }, + { url = "https://files.pythonhosted.org/packages/bc/46/d6d9789eb6e61bfa073a13fd2b5cbbcf022a7781adbb060a25d82f16437e/chonkie-1.0.6-py3-none-any.whl", hash = "sha256:d8cfcf665cb6a64ac6ca87da61207372a88b9e5a7bb697faade78069c853e4b1", size = 89526 }, ] [package.optional-dependencies] all = [ + { name = "accelerate" }, + { name = "cohere" }, + { name = "google-genai" }, + { name = "huggingface-hub" }, + { name = "jsonschema" }, + { name = 
"magika" }, { name = "model2vec" }, { name = "numpy" }, { name = "openai" }, + { name = "pydantic" }, + { name = "rich" }, { name = "sentence-transformers" }, + { name = "tiktoken" }, + { name = "torch" }, + { name = "transformers" }, + { name = "tree-sitter" }, + { name = "tree-sitter-language-pack" }, ] [[package]] @@ -394,6 +412,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, ] +[[package]] +name = "cohere" +version = "5.15.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fastavro" }, + { name = "httpx" }, + { name = "httpx-sse" }, + { name = "pydantic" }, + { name = "pydantic-core" }, + { name = "requests" }, + { name = "tokenizers" }, + { name = "types-requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/33/69c7d1b25a20eafef4197a1444c7f87d5241e936194e54876ea8996157e6/cohere-5.15.0.tar.gz", hash = "sha256:e802d4718ddb0bb655654382ebbce002756a3800faac30296cde7f1bdc6ff2cc", size = 135021 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/87/94694db7fe6df979fbc03286eaabdfa98f1c8fa532960e5afdf965e10960/cohere-5.15.0-py3-none-any.whl", hash = "sha256:22ff867c2a6f2fc2b585360c6072f584f11f275ef6d9242bac24e0fa2df1dfb5", size = 259522 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -660,6 +698,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/08/9968963c1fb8c34627b7f1fbcdfe9438540f87dc7c9bfb59bb4fd19a4ecf/fastapi_users_db_sqlalchemy-7.0.0-py3-none-any.whl", hash = "sha256:5fceac018e7cfa69efc70834dd3035b3de7988eb4274154a0dbe8b14f5aa001e", size = 6891 }, ] +[[package]] +name = "fastavro" +version = "1.10.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f3/67/7121d2221e998706cac00fa779ec44c1c943cb65e8a7ed1bd57d78d93f2c/fastavro-1.10.0.tar.gz", hash = "sha256:47bf41ac6d52cdfe4a3da88c75a802321321b37b663a900d12765101a5d6886f", size = 987970 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/a4/8e69c0a5cd121e5d476237de1bde5a7947f791ae45768ae52ed0d3ea8d18/fastavro-1.10.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cfe57cb0d72f304bd0dcc5a3208ca6a7363a9ae76f3073307d095c9d053b29d4", size = 1036343 }, + { url = "https://files.pythonhosted.org/packages/1e/01/aa219e2b33e5873d27b867ec0fad9f35f23d461114e1135a7e46c06786d2/fastavro-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74e517440c824cb65fb29d3e3903a9406f4d7c75490cef47e55c4c82cdc66270", size = 3263368 }, + { url = "https://files.pythonhosted.org/packages/a7/ba/1766e2d7d95df2e95e9e9a089dc7a537c0616720b053a111a918fa7ee6b6/fastavro-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:203c17d44cadde76e8eecb30f2d1b4f33eb478877552d71f049265dc6f2ecd10", size = 3328933 }, + { url = "https://files.pythonhosted.org/packages/2e/40/26e56696b9696ab4fbba25a96b8037ca3f9fd8a8cc55b4b36400ef023e49/fastavro-1.10.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6575be7f2b5f94023b5a4e766b0251924945ad55e9a96672dc523656d17fe251", size = 3258045 }, + { url = "https://files.pythonhosted.org/packages/4e/bc/2f6c92c06c5363372abe828bccdd95762f2c1983b261509f94189c38c8a1/fastavro-1.10.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe471deb675ed2f01ee2aac958fbf8ebb13ea00fa4ce7f87e57710a0bc592208", size = 3418001 }, + { url = "https://files.pythonhosted.org/packages/0c/ce/cfd16546c04ebbca1be80873b533c788cec76f7bfac231bfac6786047572/fastavro-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:567ff515f2a5d26d9674b31c95477f3e6022ec206124c62169bc2ffaf0889089", size = 487855 }, + { url = 
"https://files.pythonhosted.org/packages/c9/c4/163cf154cc694c2dccc70cd6796db6214ac668a1260bf0310401dad188dc/fastavro-1.10.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:82263af0adfddb39c85f9517d736e1e940fe506dfcc35bc9ab9f85e0fa9236d8", size = 1022741 }, + { url = "https://files.pythonhosted.org/packages/38/01/a24598f5f31b8582a92fe9c41bf91caeed50d5b5eaa7576e6f8b23cb488d/fastavro-1.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:566c193109ff0ff84f1072a165b7106c4f96050078a4e6ac7391f81ca1ef3efa", size = 3237421 }, + { url = "https://files.pythonhosted.org/packages/a7/bf/08bcf65cfb7feb0e5b1329fafeb4a9b95b7b5ec723ba58c7dbd0d04ded34/fastavro-1.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e400d2e55d068404d9fea7c5021f8b999c6f9d9afa1d1f3652ec92c105ffcbdd", size = 3300222 }, + { url = "https://files.pythonhosted.org/packages/53/4d/a6c25f3166328f8306ec2e6be1123ed78a55b8ab774a43a661124508881f/fastavro-1.10.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b8227497f71565270f9249fc9af32a93644ca683a0167cfe66d203845c3a038", size = 3233276 }, + { url = "https://files.pythonhosted.org/packages/47/1c/b2b2ce2bf866a248ae23e96a87b3b8369427ff79be9112073039bee1d245/fastavro-1.10.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8e62d04c65461b30ac6d314e4197ad666371e97ae8cb2c16f971d802f6c7f514", size = 3388936 }, + { url = "https://files.pythonhosted.org/packages/1f/2c/43927e22a2d57587b3aa09765098a6d833246b672d34c10c5f135414745a/fastavro-1.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:86baf8c9740ab570d0d4d18517da71626fe9be4d1142bea684db52bd5adb078f", size = 483967 }, +] + [[package]] name = "filelock" version = "3.17.0" @@ -858,6 +916,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/b6/60f2910485d32f7bba92cc33e5053b3f29d61fccaa57e5e58c600bb7e0d2/google_cloud_vision-3.10.1-py3-none-any.whl", hash = "sha256:91959ea12b0d6a8442e30c0a5062cd305f349a4840f9184b5061b3153bbd8476", 
size = 526076 }, ] +[[package]] +name = "google-genai" +version = "1.12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "google-auth" }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "typing-extensions" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/9c/c907dbea921663bb7c41f415337bedd08259d17da8d156396c7237611744/google_genai-1.12.1.tar.gz", hash = "sha256:5c7eda422360643ce602a3f6b23152470ec1039310ef40080cbe4e71237f6391", size = 167752 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/2c/5b454dec837328eb167e78f45a14da502af223f8b94a4824e2fd0df74f19/google_genai-1.12.1-py3-none-any.whl", hash = "sha256:7cbc1bc029712946ce41bcf80c0eaa89eb8c09c308efbbfe30fd491f402c258a", size = 165940 }, +] + [[package]] name = "googleapis-common-protos" version = "1.69.2" @@ -1490,6 +1566,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/80/83/8c54533b3576f4391eebea88454738978669a6cad0d8e23266224007939d/lxml-5.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:91fb6a43d72b4f8863d21f347a9163eecbf36e76e2f51068d59cd004c506f332", size = 3814484 }, ] +[[package]] +name = "magika" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "numpy" }, + { name = "onnxruntime" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/18/ea70f6abd36f455037340f12c8125918c726d08cd6e01f0b76b6884e0c38/magika-0.6.1.tar.gz", hash = "sha256:e3dd22c73936630b1cd79d0f412d6d9a53dc99ba5e3709b1ac53f56bc998e635", size = 3030234 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/be/c9f7bb9ee94abe8d344b660672001313e459c67b867b24abe32d5c80a9ce/magika-0.6.1-py3-none-any.whl", hash = "sha256:15838d2469f1394d8e9598bc7fceea1ede7f35aebe9675c6b45c6b5c48315931", size = 2968516 }, + { url = 
"https://files.pythonhosted.org/packages/3c/b9/016b174520e81faef5edb31b6c7a73966dc84ee33acd23a2e7b775df7ba4/magika-0.6.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:dadd036296a2e4840fd48fa0712848fe122da438e8f607dc8f19ca4663c359dc", size = 12408519 }, + { url = "https://files.pythonhosted.org/packages/02/b7/e7dfeb235823a82d676c68a748541c24db0249b854f945f6e3cec11c1b7e/magika-0.6.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:133c0e1a844361de86ca2dd7c530e38b324e86177d30c52e36fd82101c190b5c", size = 15089294 }, + { url = "https://files.pythonhosted.org/packages/64/f0/bec5bff0125d08c1bc3baef88beeb910121085249f67b5994ea961615b55/magika-0.6.1-py3-none-win_amd64.whl", hash = "sha256:0342b6230ea9aea7ab4b8fa92e1b46f1cc62e724d452ee8d6821a37f56738d22", size = 12378455 }, +] + [[package]] name = "makefun" version = "1.15.6" @@ -1643,7 +1737,7 @@ wheels = [ [[package]] name = "model2vec" -version = "0.4.0" +version = "0.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2" }, @@ -1655,9 +1749,9 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/83/e2/3fb7bd8c612f71ad3abded92e7401f97f1e71427d3a68a3fb85f39394b17/model2vec-0.4.0.tar.gz", hash = "sha256:48d4a3da040499b0090f736eb8f22ea0fdd35b67462d81d789c70004423adbae", size = 2486998 } +sdist = { url = "https://files.pythonhosted.org/packages/b8/c1/3cd6cab10e8b7da8c32acebf85672d38a26f5f03165bfeaa617a5ec0bb61/model2vec-0.4.1.tar.gz", hash = "sha256:fc6038416679eebe448951708f2d0bebdee8510f47970af1c81a8f054a3c3f9f", size = 2660626 } wheels = [ - { url = "https://files.pythonhosted.org/packages/93/7d/39ff093c4e45303a06e3c5825c6144cbd21f18a1393a154bbf93232b0f1a/model2vec-0.4.0-py3-none-any.whl", hash = "sha256:df30685a55841c61c6638e4f329648e76b148507bd778801d7bfcd6b970a4f2f", size = 38593 }, + { url = 
"https://files.pythonhosted.org/packages/cd/76/c8575f90f521017597c5e57e3bfef61e3f27d9cb6c741a82a24d72b10a60/model2vec-0.4.1-py3-none-any.whl", hash = "sha256:04a397a17da9b967082b6baa4c494f0be48c89ec4e1a3975b4f290f045238a38", size = 41972 }, ] [[package]] @@ -1764,18 +1858,40 @@ wheels = [ [[package]] name = "numpy" -version = "1.26.4" +version = "2.2.5" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129 } +sdist = { url = "https://files.pythonhosted.org/packages/dc/b2/ce4b867d8cd9c0ee84938ae1e6a6f7926ebf928c9090d036fc3c6a04f946/numpy-2.2.5.tar.gz", hash = "sha256:a9c0d994680cd991b1cb772e8b297340085466a6fe964bc9d4e80f5e2f43c291", size = 20273920 } wheels = [ - { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901 }, - { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868 }, - { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109 }, - { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613 }, - { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172 }, - { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643 }, - { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803 }, - { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754 }, + { url = "https://files.pythonhosted.org/packages/e2/f7/1fd4ff108cd9d7ef929b8882692e23665dc9c23feecafbb9c6b80f4ec583/numpy-2.2.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ee461a4eaab4f165b68780a6a1af95fb23a29932be7569b9fab666c407969051", size = 20948633 }, + { url = "https://files.pythonhosted.org/packages/12/03/d443c278348371b20d830af155ff2079acad6a9e60279fac2b41dbbb73d8/numpy-2.2.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ec31367fd6a255dc8de4772bd1658c3e926d8e860a0b6e922b615e532d320ddc", size = 14176123 }, + { url = "https://files.pythonhosted.org/packages/2b/0b/5ca264641d0e7b14393313304da48b225d15d471250376f3fbdb1a2be603/numpy-2.2.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:47834cde750d3c9f4e52c6ca28a7361859fcaf52695c7dc3cc1a720b8922683e", size = 5163817 }, + { url = 
"https://files.pythonhosted.org/packages/04/b3/d522672b9e3d28e26e1613de7675b441bbd1eaca75db95680635dd158c67/numpy-2.2.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:2c1a1c6ccce4022383583a6ded7bbcda22fc635eb4eb1e0a053336425ed36dfa", size = 6698066 }, + { url = "https://files.pythonhosted.org/packages/a0/93/0f7a75c1ff02d4b76df35079676b3b2719fcdfb39abdf44c8b33f43ef37d/numpy-2.2.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d75f338f5f79ee23548b03d801d28a505198297534f62416391857ea0479571", size = 14087277 }, + { url = "https://files.pythonhosted.org/packages/b0/d9/7c338b923c53d431bc837b5b787052fef9ae68a56fe91e325aac0d48226e/numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a801fef99668f309b88640e28d261991bfad9617c27beda4a3aec4f217ea073", size = 16135742 }, + { url = "https://files.pythonhosted.org/packages/2d/10/4dec9184a5d74ba9867c6f7d1e9f2e0fb5fe96ff2bf50bb6f342d64f2003/numpy-2.2.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:abe38cd8381245a7f49967a6010e77dbf3680bd3627c0fe4362dd693b404c7f8", size = 15581825 }, + { url = "https://files.pythonhosted.org/packages/80/1f/2b6fcd636e848053f5b57712a7d1880b1565eec35a637fdfd0a30d5e738d/numpy-2.2.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5a0ac90e46fdb5649ab6369d1ab6104bfe5854ab19b645bf5cda0127a13034ae", size = 17899600 }, + { url = "https://files.pythonhosted.org/packages/ec/87/36801f4dc2623d76a0a3835975524a84bd2b18fe0f8835d45c8eae2f9ff2/numpy-2.2.5-cp312-cp312-win32.whl", hash = "sha256:0cd48122a6b7eab8f06404805b1bd5856200e3ed6f8a1b9a194f9d9054631beb", size = 6312626 }, + { url = "https://files.pythonhosted.org/packages/8b/09/4ffb4d6cfe7ca6707336187951992bd8a8b9142cf345d87ab858d2d7636a/numpy-2.2.5-cp312-cp312-win_amd64.whl", hash = "sha256:ced69262a8278547e63409b2653b372bf4baff0870c57efa76c5703fd6543282", size = 12645715 }, + { url = 
"https://files.pythonhosted.org/packages/e2/a0/0aa7f0f4509a2e07bd7a509042967c2fab635690d4f48c6c7b3afd4f448c/numpy-2.2.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:059b51b658f4414fff78c6d7b1b4e18283ab5fa56d270ff212d5ba0c561846f4", size = 20935102 }, + { url = "https://files.pythonhosted.org/packages/7e/e4/a6a9f4537542912ec513185396fce52cdd45bdcf3e9d921ab02a93ca5aa9/numpy-2.2.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:47f9ed103af0bc63182609044b0490747e03bd20a67e391192dde119bf43d52f", size = 14191709 }, + { url = "https://files.pythonhosted.org/packages/be/65/72f3186b6050bbfe9c43cb81f9df59ae63603491d36179cf7a7c8d216758/numpy-2.2.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:261a1ef047751bb02f29dfe337230b5882b54521ca121fc7f62668133cb119c9", size = 5149173 }, + { url = "https://files.pythonhosted.org/packages/e5/e9/83e7a9432378dde5802651307ae5e9ea07bb72b416728202218cd4da2801/numpy-2.2.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4520caa3807c1ceb005d125a75e715567806fed67e315cea619d5ec6e75a4191", size = 6684502 }, + { url = "https://files.pythonhosted.org/packages/ea/27/b80da6c762394c8ee516b74c1f686fcd16c8f23b14de57ba0cad7349d1d2/numpy-2.2.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d14b17b9be5f9c9301f43d2e2a4886a33b53f4e6fdf9ca2f4cc60aeeee76372", size = 14084417 }, + { url = "https://files.pythonhosted.org/packages/aa/fc/ebfd32c3e124e6a1043e19c0ab0769818aa69050ce5589b63d05ff185526/numpy-2.2.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba321813a00e508d5421104464510cc962a6f791aa2fca1c97b1e65027da80d", size = 16133807 }, + { url = "https://files.pythonhosted.org/packages/bf/9b/4cc171a0acbe4666f7775cfd21d4eb6bb1d36d3a0431f48a73e9212d2278/numpy-2.2.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4cbdef3ddf777423060c6f81b5694bad2dc9675f110c4b2a60dc0181543fac7", size = 15575611 }, + { url = 
"https://files.pythonhosted.org/packages/a3/45/40f4135341850df48f8edcf949cf47b523c404b712774f8855a64c96ef29/numpy-2.2.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:54088a5a147ab71a8e7fdfd8c3601972751ded0739c6b696ad9cb0343e21ab73", size = 17895747 }, + { url = "https://files.pythonhosted.org/packages/f8/4c/b32a17a46f0ffbde8cc82df6d3daeaf4f552e346df143e1b188a701a8f09/numpy-2.2.5-cp313-cp313-win32.whl", hash = "sha256:c8b82a55ef86a2d8e81b63da85e55f5537d2157165be1cb2ce7cfa57b6aef38b", size = 6309594 }, + { url = "https://files.pythonhosted.org/packages/13/ae/72e6276feb9ef06787365b05915bfdb057d01fceb4a43cb80978e518d79b/numpy-2.2.5-cp313-cp313-win_amd64.whl", hash = "sha256:d8882a829fd779f0f43998e931c466802a77ca1ee0fe25a3abe50278616b1471", size = 12638356 }, + { url = "https://files.pythonhosted.org/packages/79/56/be8b85a9f2adb688e7ded6324e20149a03541d2b3297c3ffc1a73f46dedb/numpy-2.2.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:e8b025c351b9f0e8b5436cf28a07fa4ac0204d67b38f01433ac7f9b870fa38c6", size = 20963778 }, + { url = "https://files.pythonhosted.org/packages/ff/77/19c5e62d55bff507a18c3cdff82e94fe174957bad25860a991cac719d3ab/numpy-2.2.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8dfa94b6a4374e7851bbb6f35e6ded2120b752b063e6acdd3157e4d2bb922eba", size = 14207279 }, + { url = "https://files.pythonhosted.org/packages/75/22/aa11f22dc11ff4ffe4e849d9b63bbe8d4ac6d5fae85ddaa67dfe43be3e76/numpy-2.2.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:97c8425d4e26437e65e1d189d22dff4a079b747ff9c2788057bfb8114ce1e133", size = 5199247 }, + { url = "https://files.pythonhosted.org/packages/4f/6c/12d5e760fc62c08eded0394f62039f5a9857f758312bf01632a81d841459/numpy-2.2.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:352d330048c055ea6db701130abc48a21bec690a8d38f8284e00fab256dc1376", size = 6711087 }, + { url = 
"https://files.pythonhosted.org/packages/ef/94/ece8280cf4218b2bee5cec9567629e61e51b4be501e5c6840ceb593db945/numpy-2.2.5-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b4c0773b6ada798f51f0f8e30c054d32304ccc6e9c5d93d46cb26f3d385ab19", size = 14059964 }, + { url = "https://files.pythonhosted.org/packages/39/41/c5377dac0514aaeec69115830a39d905b1882819c8e65d97fc60e177e19e/numpy-2.2.5-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55f09e00d4dccd76b179c0f18a44f041e5332fd0e022886ba1c0bbf3ea4a18d0", size = 16121214 }, + { url = "https://files.pythonhosted.org/packages/db/54/3b9f89a943257bc8e187145c6bc0eb8e3d615655f7b14e9b490b053e8149/numpy-2.2.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:02f226baeefa68f7d579e213d0f3493496397d8f1cff5e2b222af274c86a552a", size = 15575788 }, + { url = "https://files.pythonhosted.org/packages/b1/c4/2e407e85df35b29f79945751b8f8e671057a13a376497d7fb2151ba0d290/numpy-2.2.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c26843fd58f65da9491165072da2cccc372530681de481ef670dcc8e27cfb066", size = 17893672 }, + { url = "https://files.pythonhosted.org/packages/29/7e/d0b44e129d038dba453f00d0e29ebd6eaf2f06055d72b95b9947998aca14/numpy-2.2.5-cp313-cp313t-win32.whl", hash = "sha256:1a161c2c79ab30fe4501d5a2bbfe8b162490757cf90b7f05be8b80bc02f7bb8e", size = 6377102 }, + { url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096 }, ] [[package]] @@ -2813,15 +2929,15 @@ flashrank = [ [[package]] name = "rich" -version = "13.9.4" +version = "14.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "markdown-it-py" }, { name = "pygments" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/ab/3a/0316b28d0761c6734d6bc14e770d85506c986c85ffb239e688eeaab2c2bc/rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098", size = 223149 } +sdist = { url = "https://files.pythonhosted.org/packages/a1/53/830aa4c3066a8ab0ae9a9955976fb770fe9c6102117c8ec4ab3ea62d89e8/rich-14.0.0.tar.gz", hash = "sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725", size = 224078 } wheels = [ - { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424 }, + { url = "https://files.pythonhosted.org/packages/0d/9b/63f4c7ebc259242c89b3acafdb37b41d1185c07ff0011164674e9076b491/rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0", size = 243229 }, ] [[package]] @@ -3112,7 +3228,7 @@ dependencies = [ requires-dist = [ { name = "alembic", specifier = ">=1.13.0" }, { name = "asyncpg", specifier = ">=0.30.0" }, - { name = "chonkie", extras = ["all"], specifier = ">=0.4.1" }, + { name = "chonkie", extras = ["all"], specifier = ">=1.0.6" }, { name = "fastapi", specifier = ">=0.115.8" }, { name = "fastapi-users", extras = ["oauth", "sqlalchemy"], specifier = ">=14.0.1" }, { name = "firecrawl-py", specifier = ">=1.12.0" }, @@ -3339,6 +3455,91 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/1a/efeecb8d83705f2f4beac98d46f2148c95ecd7babfb31b5c0f1e7017e83d/transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36", size = 9669412 }, ] +[[package]] +name = "tree-sitter" +version = "0.24.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/a7/a2/698b9d31d08ad5558f8bfbfe3a0781bd4b1f284e89bde3ad18e05101a892/tree-sitter-0.24.0.tar.gz", hash = "sha256:abd95af65ca2f4f7eca356343391ed669e764f37748b5352946f00f7fc78e734", size = 168304 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/57/3a590f287b5aa60c07d5545953912be3d252481bf5e178f750db75572bff/tree_sitter-0.24.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:14beeff5f11e223c37be7d5d119819880601a80d0399abe8c738ae2288804afc", size = 140788 }, + { url = "https://files.pythonhosted.org/packages/61/0b/fc289e0cba7dbe77c6655a4dd949cd23c663fd62a8b4d8f02f97e28d7fe5/tree_sitter-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26a5b130f70d5925d67b47db314da209063664585a2fd36fa69e0717738efaf4", size = 133945 }, + { url = "https://files.pythonhosted.org/packages/86/d7/80767238308a137e0b5b5c947aa243e3c1e3e430e6d0d5ae94b9a9ffd1a2/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fc5c3c26d83c9d0ecb4fc4304fba35f034b7761d35286b936c1db1217558b4e", size = 564819 }, + { url = "https://files.pythonhosted.org/packages/bf/b3/6c5574f4b937b836601f5fb556b24804b0a6341f2eb42f40c0e6464339f4/tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:772e1bd8c0931c866b848d0369b32218ac97c24b04790ec4b0e409901945dd8e", size = 579303 }, + { url = "https://files.pythonhosted.org/packages/0a/f4/bd0ddf9abe242ea67cca18a64810f8af230fc1ea74b28bb702e838ccd874/tree_sitter-0.24.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:24a8dd03b0d6b8812425f3b84d2f4763322684e38baf74e5bb766128b5633dc7", size = 581054 }, + { url = "https://files.pythonhosted.org/packages/8c/1c/ff23fa4931b6ef1bbeac461b904ca7e49eaec7e7e5398584e3eef836ec96/tree_sitter-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:f9e8b1605ab60ed43803100f067eed71b0b0e6c1fb9860a262727dbfbbb74751", size = 120221 }, + { url = 
"https://files.pythonhosted.org/packages/b2/2a/9979c626f303177b7612a802237d0533155bf1e425ff6f73cc40f25453e2/tree_sitter-0.24.0-cp312-cp312-win_arm64.whl", hash = "sha256:f733a83d8355fc95561582b66bbea92ffd365c5d7a665bc9ebd25e049c2b2abb", size = 108234 }, + { url = "https://files.pythonhosted.org/packages/61/cd/2348339c85803330ce38cee1c6cbbfa78a656b34ff58606ebaf5c9e83bd0/tree_sitter-0.24.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d4a6416ed421c4210f0ca405a4834d5ccfbb8ad6692d4d74f7773ef68f92071", size = 140781 }, + { url = "https://files.pythonhosted.org/packages/8b/a3/1ea9d8b64e8dcfcc0051028a9c84a630301290995cd6e947bf88267ef7b1/tree_sitter-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e0992d483677e71d5c5d37f30dfb2e3afec2f932a9c53eec4fca13869b788c6c", size = 133928 }, + { url = "https://files.pythonhosted.org/packages/fe/ae/55c1055609c9428a4aedf4b164400ab9adb0b1bf1538b51f4b3748a6c983/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57277a12fbcefb1c8b206186068d456c600dbfbc3fd6c76968ee22614c5cd5ad", size = 564497 }, + { url = "https://files.pythonhosted.org/packages/ce/d0/f2ffcd04882c5aa28d205a787353130cbf84b2b8a977fd211bdc3b399ae3/tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d25fa22766d63f73716c6fec1a31ee5cf904aa429484256bd5fdf5259051ed74", size = 578917 }, + { url = "https://files.pythonhosted.org/packages/af/82/aebe78ea23a2b3a79324993d4915f3093ad1af43d7c2208ee90be9273273/tree_sitter-0.24.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7d5d9537507e1c8c5fa9935b34f320bfec4114d675e028f3ad94f11cf9db37b9", size = 581148 }, + { url = "https://files.pythonhosted.org/packages/a1/b4/6b0291a590c2b0417cfdb64ccb8ea242f270a46ed429c641fbc2bfab77e0/tree_sitter-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:f58bb4956917715ec4d5a28681829a8dad5c342cafd4aea269f9132a83ca9b34", size = 120207 }, + { url = 
"https://files.pythonhosted.org/packages/a8/18/542fd844b75272630229c9939b03f7db232c71a9d82aadc59c596319ea6a/tree_sitter-0.24.0-cp313-cp313-win_arm64.whl", hash = "sha256:23641bd25dcd4bb0b6fa91b8fb3f46cc9f1c9f475efe4d536d3f1f688d1b84c8", size = 108232 }, +] + +[[package]] +name = "tree-sitter-c-sharp" +version = "0.23.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/22/85/a61c782afbb706a47d990eaee6977e7c2bd013771c5bf5c81c617684f286/tree_sitter_c_sharp-0.23.1.tar.gz", hash = "sha256:322e2cfd3a547a840375276b2aea3335fa6458aeac082f6c60fec3f745c967eb", size = 1317728 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/04/f6c2df4c53a588ccd88d50851155945cff8cd887bd70c175e00aaade7edf/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2b612a6e5bd17bb7fa2aab4bb6fc1fba45c94f09cb034ab332e45603b86e32fd", size = 372235 }, + { url = "https://files.pythonhosted.org/packages/99/10/1aa9486f1e28fc22810fa92cbdc54e1051e7f5536a5e5b5e9695f609b31e/tree_sitter_c_sharp-0.23.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a8b98f62bc53efcd4d971151950c9b9cd5cbe3bacdb0cd69fdccac63350d83e", size = 419046 }, + { url = "https://files.pythonhosted.org/packages/0f/21/13df29f8fcb9ba9f209b7b413a4764b673dfd58989a0dd67e9c7e19e9c2e/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:986e93d845a438ec3c4416401aa98e6a6f6631d644bbbc2e43fcb915c51d255d", size = 415999 }, + { url = "https://files.pythonhosted.org/packages/ca/72/fc6846795bcdae2f8aa94cc8b1d1af33d634e08be63e294ff0d6794b1efc/tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8024e466b2f5611c6dc90321f232d8584893c7fb88b75e4a831992f877616d2", size = 402830 }, + { url = 
"https://files.pythonhosted.org/packages/fe/3a/b6028c5890ce6653807d5fa88c72232c027c6ceb480dbeb3b186d60e5971/tree_sitter_c_sharp-0.23.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7f9bf876866835492281d336b9e1f9626ab668737f74e914c31d285261507da7", size = 397880 }, + { url = "https://files.pythonhosted.org/packages/47/d2/4facaa34b40f8104d8751746d0e1cd2ddf0beb9f1404b736b97f372bd1f3/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_amd64.whl", hash = "sha256:ae9a9e859e8f44e2b07578d44f9a220d3fa25b688966708af6aa55d42abeebb3", size = 377562 }, + { url = "https://files.pythonhosted.org/packages/d8/88/3cf6bd9959d94d1fec1e6a9c530c5f08ff4115a474f62aedb5fedb0f7241/tree_sitter_c_sharp-0.23.1-cp39-abi3-win_arm64.whl", hash = "sha256:c81548347a93347be4f48cb63ec7d60ef4b0efa91313330e69641e49aa5a08c5", size = 375157 }, +] + +[[package]] +name = "tree-sitter-embedded-template" +version = "0.23.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/28/d6/5a58ea2f0480f5ed188b733114a8c275532a2fd1568b3898793b13d28af5/tree_sitter_embedded_template-0.23.2.tar.gz", hash = "sha256:7b24dcf2e92497f54323e617564d36866230a8bfb719dbb7b45b461510dcddaa", size = 8471 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/c1/be0c48ed9609b720e74ade86f24ea086e353fe9c7405ee9630c3d52d09a2/tree_sitter_embedded_template-0.23.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:a505c2d2494464029d79db541cab52f6da5fb326bf3d355e69bf98b84eb89ae0", size = 9554 }, + { url = "https://files.pythonhosted.org/packages/6d/a5/7c12f5d302525ee36d1eafc28a68e4454da5bad208436d547326bee4ed76/tree_sitter_embedded_template-0.23.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:28028b93b42cc3753261ae7ce066675d407f59de512417524f9c3ab7792b1d37", size = 10051 }, + { url = 
"https://files.pythonhosted.org/packages/cd/87/95aaba8b64b849200bd7d4ae510cc394ecaef46a031499cbff301766970d/tree_sitter_embedded_template-0.23.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec399d59ce93ffb60759a2d96053eed529f3c3f6a27128f261710d0d0de60e10", size = 17532 }, + { url = "https://files.pythonhosted.org/packages/13/f8/8c837b898f00b35f9f3f76a4abc525e80866a69343083c9ff329e17ecb03/tree_sitter_embedded_template-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcfa01f62b88d50dbcb736cc23baec8ddbfe08daacfdc613eee8c04ab65efd09", size = 17394 }, + { url = "https://files.pythonhosted.org/packages/89/9b/893adf9e465d2d7f14870871bf2f3b30045e5ac417cb596f667a72eda493/tree_sitter_embedded_template-0.23.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6debd24791466f887109a433c31aa4a5deeba2b217817521c745a4e748a944ed", size = 16439 }, + { url = "https://files.pythonhosted.org/packages/40/96/e79934572723673db9f867000500c6eea61a37705e02c7aee9ee031bbb6f/tree_sitter_embedded_template-0.23.2-cp39-abi3-win_amd64.whl", hash = "sha256:158fecb38be5b15db0190ef7238e5248f24bf32ae3cab93bc1197e293a5641eb", size = 12572 }, + { url = "https://files.pythonhosted.org/packages/63/06/27f678b9874e4e2e39ddc6f5cce3374c8c60e6046ea8588a491ab6fc9fcb/tree_sitter_embedded_template-0.23.2-cp39-abi3-win_arm64.whl", hash = "sha256:9f1f3b79fe273f3d15a5b64c85fc6ebfb48decfbe8542accd05f5b7694860df0", size = 11232 }, +] + +[[package]] +name = "tree-sitter-language-pack" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tree-sitter" }, + { name = "tree-sitter-c-sharp" }, + { name = "tree-sitter-embedded-template" }, + { name = "tree-sitter-yaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/1e/2d63d93025fd5b527327c3fd348955cebaec02a3f1bcec88ab4d88ddfc39/tree_sitter_language_pack-0.7.2.tar.gz", hash = 
"sha256:46fc96cc3bddfee7091fdedec2ae7e34218679e58241e8319bf82026f6d02eae", size = 59264078 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/9d/2c6272bf4fd18a22d8c07d3c983940dbece4f0e9e21f5c78f15a2740f435/tree_sitter_language_pack-0.7.2-cp39-abi3-macosx_10_13_universal2.whl", hash = "sha256:4036603020bd32060d9931a64f8c3d8637de575f350f11534971012e51a27a95", size = 28132977 }, + { url = "https://files.pythonhosted.org/packages/2b/e2/0f2511019c27b870061f9ad719074095ef84cd7857a730765bfa066384be/tree_sitter_language_pack-0.7.2-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:801926dbc81eeca4ce97b846cc899dcf3fecfdc3b2514a68eeeb118f70ac686d", size = 17576769 }, + { url = "https://files.pythonhosted.org/packages/3a/88/7b38233def5c359503ad4d36533f96f9fe2943a8eeeced66b36312c49e1b/tree_sitter_language_pack-0.7.2-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:77be80335fb585f48eb268b0e07ca54f3da8f30c2eab7be749113f116c3ef316", size = 17433872 }, + { url = "https://files.pythonhosted.org/packages/f8/27/fc5dce240b68a1ed876bc80b2238fbaaa0f695dbaf88660728a0239a2b20/tree_sitter_language_pack-0.7.2-cp39-abi3-win_amd64.whl", hash = "sha256:d71c6b4c14b3370ca783319ede7a581a10e6dd1bdfe5d31d316d9216981a6406", size = 14316050 }, +] + +[[package]] +name = "tree-sitter-yaml" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/93/04/6de8be8112c50450cab753fcd6b74d8368c60f6099bf551cee0bec69563a/tree_sitter_yaml-0.7.0.tar.gz", hash = "sha256:9c8bb17d9755c3b0e757260917240c0d19883cd3b59a5d74f205baa8bf8435a4", size = 85085 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/1d/243dbdf59fae8a4109e19f0994e2627ddedb2e16b7cf99bd42be64367742/tree_sitter_yaml-0.7.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:e21553ac190ae05bf82796df8beb4d9158ba195b5846018cb36fbc3a35bd0679", size = 43335 }, + { url = 
"https://files.pythonhosted.org/packages/e2/63/e5d5868a1498e20fd07e7db62933766fd64950279862e3e7f150b88ec69d/tree_sitter_yaml-0.7.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c022054f1f9b54201082ea83073a6c24c42d0436ad8ee99ff2574cba8f928c28", size = 44574 }, + { url = "https://files.pythonhosted.org/packages/f5/ba/9cff9a3fddb1b6b38bc71ce1dfdb8892ab15a4042c104f4582e30318b412/tree_sitter_yaml-0.7.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cd1725142f19e41c51d27c99cfc60780f596e069eb181cfa6433d993a19aa3d", size = 93088 }, + { url = "https://files.pythonhosted.org/packages/19/09/39d29d9a22cee0b3c3e4f3fdbd23e4534b9c2a84b5f962f369eafcfbf88c/tree_sitter_yaml-0.7.0-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d1b268378254f75bb27396d83c96d886ccbfcda6bd8c2778e94e3e1d2459085", size = 91367 }, + { url = "https://files.pythonhosted.org/packages/b0/b7/285653b894b351436917b5fe5e738eecaeb2128b4e4bf72bfe0c6043f62e/tree_sitter_yaml-0.7.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:27c2e7f4f49ddf410003abbb82a7b00ec77ea263d8ef08dbce1a15d293eed2fd", size = 87405 }, + { url = "https://files.pythonhosted.org/packages/bb/73/0cdc82ea653c190475a4f63dd4a1f4efd5d1c7d09d2668b8d84008a4c4f8/tree_sitter_yaml-0.7.0-cp39-abi3-win_amd64.whl", hash = "sha256:98dce0d6bc376f842cfb1d3c32512eea95b37e61cd2c87074bb4b05c999917c8", size = 45360 }, + { url = "https://files.pythonhosted.org/packages/2e/32/af2d676b0176a958f22a75b04be836e09476a10844baab78c018a5030297/tree_sitter_yaml-0.7.0-cp39-abi3-win_arm64.whl", hash = "sha256:f0f8d8e05fa8e70f08d0f18a209d6026e171844f4ea7090e7c779b9c375b3a31", size = 43650 }, +] + [[package]] name = "triton" version = "3.2.0" @@ -3348,6 +3549,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/30/37a3384d1e2e9320331baca41e835e90a3767303642c7a80d4510152cbcf/triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0", size = 253154278 }, ] +[[package]] +name = "types-requests" +version = "2.32.0.20250328" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/7d/eb174f74e3f5634eaacb38031bbe467dfe2e545bc255e5c90096ec46bc46/types_requests-2.32.0.20250328.tar.gz", hash = "sha256:c9e67228ea103bd811c96984fac36ed2ae8da87a36a633964a21f199d60baf32", size = 22995 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cc/15/3700282a9d4ea3b37044264d3e4d1b1f0095a4ebf860a99914fd544e3be3/types_requests-2.32.0.20250328-py3-none-any.whl", hash = "sha256:72ff80f84b15eb3aa7a8e2625fffb6a93f2ad5a0c20215fc1dcfa61117bcb2a2", size = 20663 }, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -3381,7 +3594,7 @@ wheels = [ [[package]] name = "unstructured" -version = "0.16.25" +version = "0.17.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "backoff" }, @@ -3406,9 +3619,9 @@ dependencies = [ { name = "unstructured-client" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/64/31/98c4c78e305d1294888adf87fd5ee30577a4c393951341ca32b43f167f1e/unstructured-0.16.25.tar.gz", hash = "sha256:73b9b0f51dbb687af572ecdb849a6811710b9cac797ddeab8ee80fa07d8aa5e6", size = 1683097 } +sdist = { url = "https://files.pythonhosted.org/packages/b4/49/b95ff4b609d7328cd0394ac9d8ad69839e11a1f879462496afcf4887154a/unstructured-0.17.2.tar.gz", hash = "sha256:af18c3caef0a6c562cf77e34ee8b6ff522b605031d2336ffe565df66f126aa46", size = 1684745 } wheels = [ - { url = "https://files.pythonhosted.org/packages/12/4f/ad08585b5c8a33c82ea119494c4d3023f4796958c56e668b15cc282ec0a0/unstructured-0.16.25-py3-none-any.whl", hash = "sha256:14719ccef2830216cf1c5bf654f75e2bf07b17ca5dcee9da5ac74618130fd337", size = 1769286 }, + { url = 
"https://files.pythonhosted.org/packages/cb/88/061a9dedd4e8cc0c31097c3275a9ef1fd7307e26afac5cd582487386e1b8/unstructured-0.17.2-py3-none-any.whl", hash = "sha256:527dd26a4b273aebef2f9119c9d4f0d0ce17640038d92296d23abe89be123840", size = 1771563 }, ] [package.optional-dependencies] @@ -3418,6 +3631,7 @@ all-docs = [ { name = "markdown" }, { name = "networkx" }, { name = "onnx" }, + { name = "onnxruntime" }, { name = "openpyxl" }, { name = "pandas" }, { name = "pdf2image" }, From 42bde287819b1db96c8a240fb731404b8ff8687c Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 30 Apr 2025 00:10:50 -0700 Subject: [PATCH 07/70] fix: Support for All Embeddings --- .../versions/5_remove_title_char_limit.py | 58 +++++++++++++++++++ surfsense_backend/app/config/__init__.py | 4 +- surfsense_backend/app/db.py | 6 +- 3 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 surfsense_backend/alembic/versions/5_remove_title_char_limit.py diff --git a/surfsense_backend/alembic/versions/5_remove_title_char_limit.py b/surfsense_backend/alembic/versions/5_remove_title_char_limit.py new file mode 100644 index 000000000..57ed10899 --- /dev/null +++ b/surfsense_backend/alembic/versions/5_remove_title_char_limit.py @@ -0,0 +1,58 @@ +"""Remove char limit on title columns + +Revision ID: 5 +Revises: 4 +Create Date: 2023-06-10 00:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '5' +down_revision: Union[str, None] = '4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Alter Chat table + op.alter_column('chats', 'title', + existing_type=sa.String(200), + type_=sa.String(), + existing_nullable=False) + + # Alter Document table + op.alter_column('documents', 'title', + existing_type=sa.String(200), + type_=sa.String(), + existing_nullable=False) + + # Alter Podcast table + op.alter_column('podcasts', 'title', + existing_type=sa.String(200), + type_=sa.String(), + existing_nullable=False) + + +def downgrade() -> None: + # Revert Chat table + op.alter_column('chats', 'title', + existing_type=sa.String(), + type_=sa.String(200), + existing_nullable=False) + + # Revert Document table + op.alter_column('documents', 'title', + existing_type=sa.String(), + type_=sa.String(200), + existing_nullable=False) + + # Revert Podcast table + op.alter_column('podcasts', 'title', + existing_type=sa.String(), + type_=sa.String(200), + existing_nullable=False) \ No newline at end of file diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 4adf2b7dc..91968aac0 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -38,10 +38,10 @@ class Config: EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL") embedding_model_instance = AutoEmbeddings.get_embeddings(EMBEDDING_MODEL) chunker_instance = RecursiveChunker( - chunk_size=embedding_model_instance.max_seq_length, + chunk_size=getattr(embedding_model_instance, 'max_seq_length', 512) ) code_chunker_instance = CodeChunker( - chunk_size=embedding_model_instance.max_seq_length + chunk_size=getattr(embedding_model_instance, 'max_seq_length', 512) ) # Reranker's Configuration | Pinecode, Cohere etc. 
Read more at https://github.com/AnswerDotAI/rerankers?tab=readme-ov-file#usage diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 320f059dd..b4ee3e790 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -76,7 +76,7 @@ class Chat(BaseModel, TimestampMixin): __tablename__ = "chats" type = Column(SQLAlchemyEnum(ChatType), nullable=False) - title = Column(String(200), nullable=False, index=True) + title = Column(String, nullable=False, index=True) initial_connectors = Column(ARRAY(String), nullable=True) messages = Column(JSON, nullable=False) @@ -86,7 +86,7 @@ class Chat(BaseModel, TimestampMixin): class Document(BaseModel, TimestampMixin): __tablename__ = "documents" - title = Column(String(200), nullable=False, index=True) + title = Column(String, nullable=False, index=True) document_type = Column(SQLAlchemyEnum(DocumentType), nullable=False) document_metadata = Column(JSON, nullable=True) @@ -109,7 +109,7 @@ class Chunk(BaseModel, TimestampMixin): class Podcast(BaseModel, TimestampMixin): __tablename__ = "podcasts" - title = Column(String(200), nullable=False, index=True) + title = Column(String, nullable=False, index=True) is_generated = Column(Boolean, nullable=False, default=False) podcast_content = Column(Text, nullable=False, default="") file_location = Column(String(500), nullable=False, default="") From d899678b72d24e97bed702e2c0e625d1db02ef71 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 30 Apr 2025 23:52:29 -0700 Subject: [PATCH 08/70] chore: updated docs --- surfsense_backend/app/config/__init__.py | 4 ++-- surfsense_web/content/docs/docker-installation.mdx | 2 +- surfsense_web/content/docs/manual-installation.mdx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 91968aac0..8c457e17b 100644 --- a/surfsense_backend/app/config/__init__.py +++ 
b/surfsense_backend/app/config/__init__.py @@ -18,7 +18,7 @@ class Config: # Database DATABASE_URL = os.getenv("DATABASE_URL") - # Google OAuth + # AUTH: Google OAuth GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID") GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") NEXT_FRONTEND_URL = os.getenv("NEXT_FRONTEND_URL") @@ -27,7 +27,7 @@ class Config: LONG_CONTEXT_LLM = os.getenv("LONG_CONTEXT_LLM") long_context_llm_instance = ChatLiteLLM(model=LONG_CONTEXT_LLM) - # GPT Researcher + # FAST & STRATEGIC LLM's FAST_LLM = os.getenv("FAST_LLM") STRATEGIC_LLM = os.getenv("STRATEGIC_LLM") fast_llm_instance = ChatLiteLLM(model=FAST_LLM) diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx index 2a373d048..236366546 100644 --- a/surfsense_web/content/docs/docker-installation.mdx +++ b/surfsense_web/content/docs/docker-installation.mdx @@ -65,7 +65,7 @@ Before you begin, ensure you have: | GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID obtained from Google Cloud Console | | GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret obtained from Google Cloud Console | | NEXT_FRONTEND_URL | URL where your frontend application is hosted (e.g., `http://localhost:3000`) | - | EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | + | EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | | RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx index 477f5ef17..3813b1b88 100644 --- a/surfsense_web/content/docs/manual-installation.mdx +++ 
b/surfsense_web/content/docs/manual-installation.mdx @@ -53,7 +53,7 @@ Edit the `.env` file and set the following variables: | GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID | | GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret | | NEXT_FRONTEND_URL | Frontend application URL (e.g., `http://localhost:3000`) | -| EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | +| EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | | RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | FAST_LLM | LiteLLM routed faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | From dc97072145f3840d5b4e1d7d5721556f829dde06 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 30 Apr 2025 23:52:58 -0700 Subject: [PATCH 09/70] chore: updated docs --- surfsense_backend/app/config/__init__.py | 4 ++-- surfsense_web/content/docs/docker-installation.mdx | 2 +- surfsense_web/content/docs/manual-installation.mdx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 91968aac0..8c457e17b 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -18,7 +18,7 @@ class Config: # Database DATABASE_URL = os.getenv("DATABASE_URL") - # Google OAuth + # AUTH: Google OAuth GOOGLE_OAUTH_CLIENT_ID = os.getenv("GOOGLE_OAUTH_CLIENT_ID") GOOGLE_OAUTH_CLIENT_SECRET = os.getenv("GOOGLE_OAUTH_CLIENT_SECRET") NEXT_FRONTEND_URL = os.getenv("NEXT_FRONTEND_URL") @@ -27,7 +27,7 @@ class Config: LONG_CONTEXT_LLM = os.getenv("LONG_CONTEXT_LLM") long_context_llm_instance = ChatLiteLLM(model=LONG_CONTEXT_LLM) - # GPT Researcher + # FAST & STRATEGIC LLM's FAST_LLM = os.getenv("FAST_LLM") STRATEGIC_LLM = 
os.getenv("STRATEGIC_LLM") fast_llm_instance = ChatLiteLLM(model=FAST_LLM) diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx index 2a373d048..236366546 100644 --- a/surfsense_web/content/docs/docker-installation.mdx +++ b/surfsense_web/content/docs/docker-installation.mdx @@ -65,7 +65,7 @@ Before you begin, ensure you have: | GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID obtained from Google Cloud Console | | GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret obtained from Google Cloud Console | | NEXT_FRONTEND_URL | URL where your frontend application is hosted (e.g., `http://localhost:3000`) | - | EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | + | EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | | RERANKERS_MODEL_NAME | Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx index 477f5ef17..3813b1b88 100644 --- a/surfsense_web/content/docs/manual-installation.mdx +++ b/surfsense_web/content/docs/manual-installation.mdx @@ -53,7 +53,7 @@ Edit the `.env` file and set the following variables: | GOOGLE_OAUTH_CLIENT_ID | Google OAuth client ID | | GOOGLE_OAUTH_CLIENT_SECRET | Google OAuth client secret | | NEXT_FRONTEND_URL | Frontend application URL (e.g., `http://localhost:3000`) | -| EMBEDDING_MODEL | Name of the embedding model (e.g., `mixedbread-ai/mxbai-embed-large-v1`) | +| EMBEDDING_MODEL | Name of the embedding model (e.g., `openai://text-embedding-ada-002`, `anthropic://claude-v1`, `mixedbread-ai/mxbai-embed-large-v1`) | | RERANKERS_MODEL_NAME | 
Name of the reranker model (e.g., `ms-marco-MiniLM-L-12-v2`) | | RERANKERS_MODEL_TYPE | Type of reranker model (e.g., `flashrank`) | | FAST_LLM | LiteLLM routed faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | From 906344d6f3724bcf65aeaf4056a7bd5133f3fac0 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Sat, 3 May 2025 01:08:19 -0700 Subject: [PATCH 10/70] chore: qol patches --- README.md | 17 +++++++-- surfsense_web/app/login/GoogleLoginButton.tsx | 36 +++++++++++++++++++ .../components/ModernHeroWithGradients.tsx | 11 ++++++ surfsense_web/components/Navbar.tsx | 20 ++--------- 4 files changed, 64 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index ad8633c47..d2af00f33 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,16 @@ - - ![new_header](https://github.com/user-attachments/assets/e236b764-0ddc-42ff-a1f1-8fbb3d2e0e65) + # SurfSense While tools like NotebookLM and Perplexity are impressive and highly effective for conducting research on any topic/query, SurfSense elevates this capability by integrating with your personal knowledge base. It is a highly customizable AI research agent, connected to external sources such as search engines (Tavily, LinkUp), Slack, Linear, Notion, YouTube, GitHub and more to come. +
+MODSetter%2FSurfSense | Trendshift +
+ # Video @@ -203,3 +206,13 @@ Before installation, make sure to complete the [prerequisite setup steps](https: Contributions are very welcome! A contribution can be as small as a ⭐ or even finding and creating issues. Fine-tuning the Backend is always desired. +## Star History + + + + + + Star History Chart + + + diff --git a/surfsense_web/app/login/GoogleLoginButton.tsx b/surfsense_web/app/login/GoogleLoginButton.tsx index a4ed4a3a0..11caafbf3 100644 --- a/surfsense_web/app/login/GoogleLoginButton.tsx +++ b/surfsense_web/app/login/GoogleLoginButton.tsx @@ -34,6 +34,42 @@ export function GoogleLoginButton() { Welcome Back + + + + + + + +
+

+ SurfSense Cloud is currently in development. Check Docs for more information on Self-Hosted version. +

+
+
+
+
+
+ + MODSetter%2FSurfSense | Trendshift + +
{ const [hoveredIndex, setHoveredIndex] = useState(null); const handleGoogleLogin = () => { - // Redirect to Google OAuth authorization URL - fetch(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/auth/google/authorize`) - .then((response) => { - if (!response.ok) { - throw new Error('Failed to get authorization URL'); - } - return response.json(); - }) - .then((data) => { - if (data.authorization_url) { - window.location.href = data.authorization_url; - } else { - console.error('No authorization URL received'); - } - }) - .catch((error) => { - console.error('Error during Google login:', error); - }); + // Redirect to the login page + window.location.href = '/login'; }; return ( From 10d56acaa886ffbe776c8c99bb0430f15e94da32 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Mon, 5 May 2025 01:39:31 -0700 Subject: [PATCH 11/70] feat: Stable & Hella Fast Podcast Agent with auto FFMPEG handling. --- .gitignore | 3 +- surfsense_backend/.env.example | 3 + .../app/agents/podcaster/__init__.py | 8 + .../app/agents/podcaster/configuration.py | 28 ++ .../app/agents/podcaster/graph.py | 23 + .../app/agents/podcaster/nodes.py | 197 ++++++++ .../app/agents/podcaster/prompts.py | 111 ++++ .../app/agents/podcaster/state.py | 38 ++ .../app/agents/podcaster/test_podcaster.py | 474 ++++++++++++++++++ surfsense_backend/app/config/__init__.py | 24 + surfsense_backend/pyproject.toml | 2 + surfsense_backend/uv.lock | 222 ++++++++ 12 files changed, 1132 insertions(+), 1 deletion(-) create mode 100644 surfsense_backend/app/agents/podcaster/__init__.py create mode 100644 surfsense_backend/app/agents/podcaster/configuration.py create mode 100644 surfsense_backend/app/agents/podcaster/graph.py create mode 100644 surfsense_backend/app/agents/podcaster/nodes.py create mode 100644 surfsense_backend/app/agents/podcaster/prompts.py create mode 100644 surfsense_backend/app/agents/podcaster/state.py create mode 100644 surfsense_backend/app/agents/podcaster/test_podcaster.py diff --git 
a/.gitignore b/.gitignore index ac1266863..b67a7dd64 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.flashrank_cache* \ No newline at end of file +.flashrank_cache* +podcasts/* diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 6dfcc9967..8e834bf1d 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -15,6 +15,9 @@ FAST_LLM="openai/gpt-4o-mini" STRATEGIC_LLM="openai/gpt-4o" LONG_CONTEXT_LLM="gemini/gemini-2.0-flash" +#LiteLLM TTS Provider: https://docs.litellm.ai/docs/text_to_speech#supported-providers +TTS_SERVICE="openai/tts-1" + # Chosen LiteLLM Providers Keys OPENAI_API_KEY="sk-proj-iA" GEMINI_API_KEY="AIzaSyB6-1641124124124124124124124124124" diff --git a/surfsense_backend/app/agents/podcaster/__init__.py b/surfsense_backend/app/agents/podcaster/__init__.py new file mode 100644 index 000000000..8459b2977 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/__init__.py @@ -0,0 +1,8 @@ +"""New LangGraph Agent. + +This module defines a custom graph. +""" + +from .graph import graph + +__all__ = ["graph"] diff --git a/surfsense_backend/app/agents/podcaster/configuration.py b/surfsense_backend/app/agents/podcaster/configuration.py new file mode 100644 index 000000000..6bbb4ce03 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/configuration.py @@ -0,0 +1,28 @@ +"""Define the configurable parameters for the agent.""" + +from __future__ import annotations + +from dataclasses import dataclass, fields +from typing import Optional + +from langchain_core.runnables import RunnableConfig + + +@dataclass(kw_only=True) +class Configuration: + """The configuration for the agent.""" + + # Changeme: Add configurable values here! 
+ # these values can be pre-set when you + # create assistants (https://langchain-ai.github.io/langgraph/cloud/how-tos/configuration_cloud/) + # and when you invoke the graph + podcast_title: str + + @classmethod + def from_runnable_config( + cls, config: Optional[RunnableConfig] = None + ) -> Configuration: + """Create a Configuration instance from a RunnableConfig object.""" + configurable = (config.get("configurable") or {}) if config else {} + _fields = {f.name for f in fields(cls) if f.init} + return cls(**{k: v for k, v in configurable.items() if k in _fields}) diff --git a/surfsense_backend/app/agents/podcaster/graph.py b/surfsense_backend/app/agents/podcaster/graph.py new file mode 100644 index 000000000..f4604a7c8 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/graph.py @@ -0,0 +1,23 @@ +from langgraph.graph import StateGraph + +from .configuration import Configuration +from .state import State + + +from .nodes import create_merged_podcast_audio, create_podcast_transcript + +# Define a new graph +workflow = StateGraph(State, config_schema=Configuration) + +# Add the node to the graph +workflow.add_node("create_podcast_transcript", create_podcast_transcript) +workflow.add_node("create_merged_podcast_audio", create_merged_podcast_audio) + +# Set the entrypoint as `call_model` +workflow.add_edge("__start__", "create_podcast_transcript") +workflow.add_edge("create_podcast_transcript", "create_merged_podcast_audio") +workflow.add_edge("create_merged_podcast_audio", "__end__") + +# Compile the workflow into an executable graph +graph = workflow.compile() +graph.name = "Surfsense Podcaster" # This defines the custom name in LangSmith diff --git a/surfsense_backend/app/agents/podcaster/nodes.py b/surfsense_backend/app/agents/podcaster/nodes.py new file mode 100644 index 000000000..810307ec2 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/nodes.py @@ -0,0 +1,197 @@ +from typing import Any, Dict +import json +import os +import uuid +from 
pathlib import Path +import asyncio + +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.runnables import RunnableConfig +from litellm import aspeech +from ffmpeg.asyncio import FFmpeg + +from .configuration import Configuration +from .state import PodcastTranscriptEntry, State, PodcastTranscripts +from .prompts import get_podcast_generation_prompt +from app.config import config as app_config + + +async def create_podcast_transcript(state: State, config: RunnableConfig) -> Dict[str, Any]: + """Each node does work.""" + + # Initialize LLM + llm = app_config.long_context_llm_instance + + # Get the prompt + prompt = get_podcast_generation_prompt() + + # Create the messages + messages = [ + SystemMessage(content=prompt), + HumanMessage(content=state.source_content) + ] + + # Generate the podcast transcript + llm_response = await llm.ainvoke(messages) + + # First try the direct approach + try: + podcast_transcript = PodcastTranscripts.model_validate(json.loads(llm_response.content)) + except (json.JSONDecodeError, ValueError) as e: + print(f"Direct JSON parsing failed, trying fallback approach: {str(e)}") + + # Fallback: Parse the JSON response manually + try: + # Extract JSON content from the response + content = llm_response.content + + # Find the JSON in the content (handle case where LLM might add additional text) + json_start = content.find('{') + json_end = content.rfind('}') + 1 + if json_start >= 0 and json_end > json_start: + json_str = content[json_start:json_end] + + # Parse the JSON string + parsed_data = json.loads(json_str) + + # Convert to Pydantic model + podcast_transcript = PodcastTranscripts.model_validate(parsed_data) + + print(f"Successfully parsed podcast transcript using fallback approach") + else: + # If JSON structure not found, raise a clear error + error_message = f"Could not find valid JSON in LLM response. 
Raw response: {content}" + print(error_message) + raise ValueError(error_message) + + except (json.JSONDecodeError, ValueError) as e2: + # Log the error and re-raise it + error_message = f"Error parsing LLM response (fallback also failed): {str(e2)}" + print(f"Error parsing LLM response: {str(e2)}") + print(f"Raw response: {llm_response.content}") + raise + + return { + "podcast_transcript": podcast_transcript.podcast_transcripts + } + + +async def create_merged_podcast_audio(state: State, config: RunnableConfig) -> Dict[str, Any]: + """Generate audio for each transcript and merge them into a single podcast file.""" + + configuration = Configuration.from_runnable_config(config) + + starting_transcript = PodcastTranscriptEntry( + speaker_id=1, + dialog=f"Welcome to {configuration.podcast_title} Podcast." + ) + + transcript = state.podcast_transcript + + # Merge the starting transcript with the podcast transcript + # Check if transcript is a PodcastTranscripts object or already a list + if hasattr(transcript, 'podcast_transcripts'): + transcript_entries = transcript.podcast_transcripts + else: + transcript_entries = transcript + + merged_transcript = [starting_transcript] + transcript_entries + + # Create a temporary directory for audio files + temp_dir = Path("temp_audio") + temp_dir.mkdir(exist_ok=True) + + # Generate a unique session ID for this podcast + session_id = str(uuid.uuid4()) + output_path = f"podcasts/{session_id}_podcast.mp3" + os.makedirs("podcasts", exist_ok=True) + + # Map of speaker_id to voice + voice_mapping = { + 0: "alloy", # Default/intro voice + 1: "echo", # First speaker + # 2: "fable", # Second speaker + # 3: "onyx", # Third speaker + # 4: "nova", # Fourth speaker + # 5: "shimmer" # Fifth speaker + } + + # Generate audio for each transcript segment + audio_files = [] + + async def generate_speech_for_segment(segment, index): + # Handle both dictionary and PodcastTranscriptEntry objects + if hasattr(segment, 'speaker_id'): + speaker_id = 
segment.speaker_id + dialog = segment.dialog + else: + speaker_id = segment.get("speaker_id", 0) + dialog = segment.get("dialog", "") + + # Select voice based on speaker_id + voice = voice_mapping.get(speaker_id, "alloy") + + # Generate a unique filename for this segment + filename = f"{temp_dir}/{session_id}_{index}.mp3" + + try: + # Generate speech using litellm + response = await aspeech( + model=app_config.TTS_SERVICE, + voice=voice, + input=dialog, + max_retries=2, + timeout=600, + ) + + # Save the audio to a file - use proper streaming method + with open(filename, 'wb') as f: + f.write(response.content) + + return filename + except Exception as e: + print(f"Error generating speech for segment {index}: {str(e)}") + raise + + # Generate all audio files concurrently + tasks = [generate_speech_for_segment(segment, i) for i, segment in enumerate(merged_transcript)] + audio_files = await asyncio.gather(*tasks) + + # Merge audio files using ffmpeg + try: + # Create FFmpeg instance with the first input + ffmpeg = FFmpeg().option("y") + + # Add each audio file as input + for audio_file in audio_files: + ffmpeg = ffmpeg.input(audio_file) + + # Configure the concatenation and output + filter_complex = [] + for i in range(len(audio_files)): + filter_complex.append(f"[{i}:0]") + + filter_complex_str = "".join(filter_complex) + f"concat=n={len(audio_files)}:v=0:a=1[outa]" + ffmpeg = ffmpeg.option("filter_complex", filter_complex_str) + ffmpeg = ffmpeg.output(output_path, map="[outa]") + + # Execute FFmpeg + await ffmpeg.execute() + + print(f"Successfully created podcast audio: {output_path}") + + except Exception as e: + print(f"Error merging audio files: {str(e)}") + raise + finally: + # Clean up temporary files + for audio_file in audio_files: + try: + os.remove(audio_file) + except: + pass + + return { + "podcast_transcript": merged_transcript, + "final_podcast_file_path": output_path + } diff --git a/surfsense_backend/app/agents/podcaster/prompts.py 
b/surfsense_backend/app/agents/podcaster/prompts.py new file mode 100644 index 000000000..2b4bdcfec --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/prompts.py @@ -0,0 +1,111 @@ +import datetime + + +def get_podcast_generation_prompt(): + return f""" +Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")} + +You are a master podcast scriptwriter, adept at transforming diverse input content into a lively, engaging, and natural-sounding conversation between two distinct podcast hosts. Your primary objective is to craft authentic, flowing dialogue that captures the spontaneity and chemistry of a real podcast discussion, completely avoiding any hint of robotic scripting or stiff formality. Think dynamic interplay, not just information delivery. + + +- '': A block of text containing the information to be discussed in the podcast. This could be research findings, an article summary, a detailed outline, user chat history related to the topic, or any other relevant raw information. The content might be unstructured but serves as the factual basis for the podcast dialogue. + + + +A JSON object containing the podcast transcript with alternating speakers: +{{ + "podcast_transcripts": [ + {{ + "speaker_id": 0, + "dialog": "Speaker 0 dialog here" + }}, + {{ + "speaker_id": 1, + "dialog": "Speaker 1 dialog here" + }}, + {{ + "speaker_id": 0, + "dialog": "Speaker 0 dialog here" + }}, + {{ + "speaker_id": 1, + "dialog": "Speaker 1 dialog here" + }} + ] +}} + + + +1. **Establish Distinct & Consistent Host Personas:** + * **Speaker 0 (Lead Host):** Drives the conversation forward, introduces segments, poses key questions derived from the source content, and often summarizes takeaways. Maintain a guiding, clear, and engaging tone. + * **Speaker 1 (Co-Host/Expert):** Offers deeper insights, provides alternative viewpoints or elaborations on the source content, asks clarifying or challenging questions, and shares relevant anecdotes or examples. 
Adopt a complementary tone (e.g., analytical, enthusiastic, reflective, slightly skeptical). + * **Consistency is Key:** Ensure each speaker maintains their distinct voice, vocabulary choice, sentence structure, and perspective throughout the entire script. Avoid having them sound interchangeable. Their interaction should feel like a genuine partnership. + +2. **Craft Natural & Dynamic Dialogue:** + * **Emulate Real Conversation:** Use contractions (e.g., "don't", "it's"), interjections ("Oh!", "Wow!", "Hmm"), discourse markers ("you know", "right?", "well"), and occasional natural pauses or filler words. Avoid overly formal language or complex sentence structures typical of written text. + * **Foster Interaction & Chemistry:** Write dialogue where speakers genuinely react *to each other*. They should build on points ("Exactly, and that reminds me..."), ask follow-up questions ("Could you expand on that?"), express agreement/disagreement respectfully ("That's a fair point, but have you considered...?"), and show active listening. + * **Vary Rhythm & Pace:** Mix short, punchy lines with longer, more explanatory ones. Vary sentence beginnings. Use questions to break up exposition. The rhythm should feel spontaneous, not monotonous. + * **Inject Personality & Relatability:** Allow for appropriate humor, moments of surprise or curiosity, brief personal reflections ("I actually experienced something similar..."), or relatable asides that fit the hosts' personas and the topic. Lightly reference past discussions if it enhances context ("Remember last week when we touched on...?"). + +3. **Structure for Flow and Listener Engagement:** + * **Natural Beginning:** Start with dialogue that flows naturally after an introduction (which will be added manually). Avoid redundant greetings or podcast name mentions since these will be added separately. + * **Logical Progression & Signposting:** Guide the listener through the information smoothly. 
Use clear transitions to link different ideas or segments ("So, now that we've covered X, let's dive into Y...", "That actually brings me to another key finding..."). Ensure topics flow logically from one to the next. + * **Meaningful Conclusion:** Summarize the key takeaways or main points discussed, reinforcing the core message derived from the source content. End with a final thought, a lingering question for the audience, or a brief teaser for what's next, providing a sense of closure. Avoid abrupt endings. + +4. **Integrate Source Content Seamlessly & Accurately:** + * **Translate, Don't Recite:** Rephrase information from the `` into conversational language suitable for each host's persona. Avoid directly copying dense sentences or technical jargon without explanation. The goal is discussion, not narration. + * **Explain & Contextualize:** Use analogies, simple examples, storytelling, or have one host ask clarifying questions (acting as a listener surrogate) to break down complex ideas from the source. + * **Weave Information Naturally:** Integrate facts, data, or key points from the source *within* the dialogue, not as standalone, undigested blocks. Attribute information conversationally where appropriate ("The research mentioned...", "Apparently, the key factor is..."). + * **Balance Depth & Accessibility:** Ensure the conversation is informative and factually accurate based on the source content, but prioritize clear communication and engaging delivery over exhaustive technical detail. Make it understandable and interesting for a general audience. + +5. **Length & Pacing:** + * **Six-Minute Duration:** Create a transcript that, when read at a natural speaking pace, would result in approximately 6 minutes of audio. Typically, this means around 1000 words total (based on average speaking rate of 150 words per minute). + * **Concise Speaking Turns:** Keep most speaking turns relatively brief and focused. 
Aim for a natural back-and-forth rhythm rather than extended monologues. + * **Essential Content Only:** Prioritize the most important information from the source content. Focus on quality over quantity, ensuring every line contributes meaningfully to the topic. + + + +Input: "Quantum computing uses quantum bits or qubits which can exist in multiple states simultaneously due to superposition." + +Output: +{{ + "podcast_transcripts": [ + {{ + "speaker_id": 0, + "dialog": "Today we're diving into the mind-bending world of quantum computing. You know, this is a topic I've been excited to cover for weeks." + }}, + {{ + "speaker_id": 1, + "dialog": "Same here! And I know our listeners have been asking for it. But I have to admit, the concept of quantum computing makes my head spin a little. Can we start with the basics?" + }}, + {{ + "speaker_id": 0, + "dialog": "Absolutely. So regular computers use bits, right? Little on-off switches that are either 1 or 0. But quantum computers use something called qubits, and this is where it gets fascinating." + }}, + {{ + "speaker_id": 1, + "dialog": "Wait, what makes qubits so special compared to regular bits?" + }}, + {{ + "speaker_id": 0, + "dialog": "The magic is in something called superposition. These qubits can exist in multiple states at the same time, not just 1 or 0." + }}, + {{ + "speaker_id": 1, + "dialog": "That sounds impossible! How would you even picture that?" + }}, + {{ + "speaker_id": 0, + "dialog": "Think of it like a coin spinning in the air. Before it lands, is it heads or tails?" + }}, + {{ + "speaker_id": 1, + "dialog": "Well, it's... neither? Or I guess both, until it lands? Oh, I think I see where you're going with this." + }} + ] +}} + + +Transform the source material into a lively and engaging podcast conversation. Craft dialogue that showcases authentic host chemistry and natural interaction (including occasional disagreement, building on points, or asking follow-up questions). 
Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates *and* entertains the listener while keeping within a 6-minute audio duration. + +""" \ No newline at end of file diff --git a/surfsense_backend/app/agents/podcaster/state.py b/surfsense_backend/app/agents/podcaster/state.py new file mode 100644 index 000000000..d77270d22 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/state.py @@ -0,0 +1,38 @@ +"""Define the state structures for the agent.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import List, Optional +from pydantic import BaseModel, Field + + +class PodcastTranscriptEntry(BaseModel): + """ + Represents a single entry in a podcast transcript. + """ + speaker_id: int = Field(..., description="The ID of the speaker (0 or 1)") + dialog: str = Field(..., description="The dialog text spoken by the speaker") + + +class PodcastTranscripts(BaseModel): + """ + Represents the full podcast transcript structure. + """ + podcast_transcripts: List[PodcastTranscriptEntry] = Field( + ..., + description="List of transcript entries with alternating speakers" + ) + +@dataclass +class State: + """Defines the input state for the agent, representing a narrower interface to the outside world. + + This class is used to define the initial state and structure of incoming data. + See: https://langchain-ai.github.io/langgraph/concepts/low_level/#state + for more information. + """ + + source_content: str + podcast_transcript: Optional[List[PodcastTranscriptEntry]] = None + final_podcast_file_path: Optional[str] = None diff --git a/surfsense_backend/app/agents/podcaster/test_podcaster.py b/surfsense_backend/app/agents/podcaster/test_podcaster.py new file mode 100644 index 000000000..df6728cc7 --- /dev/null +++ b/surfsense_backend/app/agents/podcaster/test_podcaster.py @@ -0,0 +1,474 @@ +#!/usr/bin/env python +""" +Test script for the Surfsense Podcaster agent.
+Run this directly from VS Code to test the Podcaster agent. +""" + +import asyncio +import os +import sys +from pathlib import Path + +# Add the project root to the Python path +project_root = str(Path(__file__).resolve().parent.parent.parent.parent) +if project_root not in sys.path: + sys.path.insert(0, project_root) + +from langchain_core.runnables import RunnableConfig + +# Now import modules using absolute imports +from app.agents.podcaster.graph import graph +from app.agents.podcaster.state import State + + +async def test_podcaster_agent(): + """Test the Podcaster agent with a sample input.""" + + # Print banner + print("=" * 80) + print("SURFSENSE PODCASTER AGENT TEST") + print("=" * 80) + + # Sample input for testing + sample_source_content = """ +

Deep-Live-Cam

+ +

+ Real-time face swap and video deepfake with a single click and only a single image. +

+ +

+hacksider%2FDeep-Live-Cam | Trendshift +

+ +

+ Demo GIF +

+ +## Disclaimer + +This deepfake software is designed to be a productive tool for the AI-generated media industry. It can assist artists in animating custom characters, creating engaging content, and even using models for clothing design. + +We are aware of the potential for unethical applications and are committed to preventative measures. A built-in check prevents the program from processing inappropriate media (nudity, graphic content, sensitive material like war footage, etc.). We will continue to develop this project responsibly, adhering to the law and ethics. We may shut down the project or add watermarks if legally required. + +- Ethical Use: Users are expected to use this software responsibly and legally. If using a real person's face, obtain their consent and clearly label any output as a deepfake when sharing online. + +- Content Restrictions: The software includes built-in checks to prevent processing inappropriate media, such as nudity, graphic content, or sensitive material. + +- Legal Compliance: We adhere to all relevant laws and ethical guidelines. If legally required, we may shut down the project or add watermarks to the output. + +- User Responsibility: We are not responsible for end-user actions. Users must ensure their use of the software aligns with ethical standards and legal requirements. + +By using this software, you agree to these terms and commit to using it in a manner that respects the rights and dignity of others. + +Users are expected to use this software responsibly and legally. If using a real person's face, obtain their consent and clearly label any output as a deepfake when sharing online. We are not responsible for end-user actions. + +## Exclusive v2.0 Quick Start - Pre-built (Windows) + + + +##### This is the fastest build you can get if you have a discrete NVIDIA or AMD GPU. + +###### These Pre-builts are perfect for non-technical users or those who don't have time to, or can't manually install all the requirements. 
Just a heads-up: this is an open-source project, so you can also install it manually. This will be 60 days ahead on the open source version. + +## TLDR; Live Deepfake in just 3 Clicks +![easysteps](https://github.com/user-attachments/assets/af825228-852c-411b-b787-ffd9aac72fc6) +1. Select a face +2. Select which camera to use +3. Press live! + +## Features & Uses - Everything is in real-time + +### Mouth Mask + +**Retain your original mouth for accurate movement using Mouth Mask** + +

+ resizable-gif +

+ +### Face Mapping + +**Use different faces on multiple subjects simultaneously** + +

+ face_mapping_source +

+ +### Your Movie, Your Face + +**Watch movies with any face in real-time** + +

+ movie +

+ +### Live Show + +**Run Live shows and performances** + +

+ show +

+ +### Memes + +**Create Your Most Viral Meme Yet** + +

+ show +
+ Created using Many Faces feature in Deep-Live-Cam +

+ +### Omegle + +**Surprise people on Omegle** + +

+ +

+ +## Installation (Manual) + +**Please be aware that the installation requires technical skills and is not for beginners. Consider downloading the prebuilt version.** + +
+Click to see the process + +### Installation + +This is more likely to work on your computer but will be slower as it utilizes the CPU. + +**1. Set up Your Platform** + +- Python (3.10 recommended) +- pip +- git +- [ffmpeg](https://www.youtube.com/watch?v=OlNWCpFdVMA) - ```iex (irm ffmpeg.tc.ht)``` +- [Visual Studio 2022 Runtimes (Windows)](https://visualstudio.microsoft.com/visual-cpp-build-tools/) + +**2. Clone the Repository** + +```bash +git clone https://github.com/hacksider/Deep-Live-Cam.git +cd Deep-Live-Cam +``` + +**3. Download the Models** + +1. [GFPGANv1.4](https://huggingface.co/hacksider/deep-live-cam/resolve/main/GFPGANv1.4.pth) +2. [inswapper\_128\_fp16.onnx](https://huggingface.co/hacksider/deep-live-cam/resolve/main/inswapper_128_fp16.onnx) + +Place these files in the "**models**" folder. + +**4. Install Dependencies** + +We highly recommend using a `venv` to avoid issues. + + +For Windows: +```bash +python -m venv venv +venv\Scripts\activate +pip install -r requirements.txt +``` +For Linux: +```bash +# Ensure you use the installed Python 3.10 +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +**For macOS:** + +Apple Silicon (M1/M2/M3) requires specific setup: + +```bash +# Install Python 3.10 (specific version is important) +brew install python@3.10 + +# Install tkinter package (required for the GUI) +brew install python-tk@3.10 + +# Create and activate virtual environment with Python 3.10 +python3.10 -m venv venv +source venv/bin/activate + +# Install dependencies +pip install -r requirements.txt +``` + +** In case something goes wrong and you need to reinstall the virtual environment ** + +```bash +# Deactivate the virtual environment +rm -rf venv + +# Reinstall the virtual environment +python -m venv venv +source venv/bin/activate + +# install the dependencies again +pip install -r requirements.txt +``` + +**Run:** If you don't have a GPU, you can run Deep-Live-Cam using `python run.py`. 
Note that initial execution will download models (~300MB). + +### GPU Acceleration + +**CUDA Execution Provider (Nvidia)** + +1. Install [CUDA Toolkit 11.8.0](https://developer.nvidia.com/cuda-11-8-0-download-archive) +2. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-gpu +pip install onnxruntime-gpu==1.16.3 +``` + +3. Usage: + +```bash +python run.py --execution-provider cuda +``` + +**CoreML Execution Provider (Apple Silicon)** + +Apple Silicon (M1/M2/M3) specific installation: + +1. Make sure you've completed the macOS setup above using Python 3.10. +2. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-silicon +pip install onnxruntime-silicon==1.13.1 +``` + +3. Usage (important: specify Python 3.10): + +```bash +python3.10 run.py --execution-provider coreml +``` + +**Important Notes for macOS:** +- You **must** use Python 3.10, not newer versions like 3.11 or 3.13 +- Always run with `python3.10` command not just `python` if you have multiple Python versions installed +- If you get error about `_tkinter` missing, reinstall the tkinter package: `brew reinstall python-tk@3.10` +- If you get model loading errors, check that your models are in the correct folder +- If you encounter conflicts with other Python versions, consider uninstalling them: + ```bash + # List all installed Python versions + brew list | grep python + + # Uninstall conflicting versions if needed + brew uninstall --ignore-dependencies python@3.11 python@3.13 + + # Keep only Python 3.10 + brew cleanup + ``` + +**CoreML Execution Provider (Apple Legacy)** + +1. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-coreml +pip install onnxruntime-coreml==1.13.1 +``` + +2. Usage: + +```bash +python run.py --execution-provider coreml +``` + +**DirectML Execution Provider (Windows)** + +1. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-directml +pip install onnxruntime-directml==1.15.1 +``` + +2. 
Usage: + +```bash +python run.py --execution-provider directml +``` + +**OpenVINO™ Execution Provider (Intel)** + +1. Install dependencies: + +```bash +pip uninstall onnxruntime onnxruntime-openvino +pip install onnxruntime-openvino==1.15.0 +``` + +2. Usage: + +```bash +python run.py --execution-provider openvino +``` +
+ +## Usage + +**1. Image/Video Mode** + +- Execute `python run.py`. +- Choose a source face image and a target image/video. +- Click "Start". +- The output will be saved in a directory named after the target video. + +**2. Webcam Mode** + +- Execute `python run.py`. +- Select a source face image. +- Click "Live". +- Wait for the preview to appear (10-30 seconds). +- Use a screen capture tool like OBS to stream. +- To change the face, select a new source image. + +## Tips and Tricks + +Check out these helpful guides to get the most out of Deep-Live-Cam: + +- [Unlocking the Secrets to the Perfect Deepfake Image](https://deeplivecam.net/index.php/blog/tips-and-tricks/unlocking-the-secrets-to-the-perfect-deepfake-image) - Learn how to create the best deepfake with full head coverage +- [Video Call with DeepLiveCam](https://deeplivecam.net/index.php/blog/tips-and-tricks/video-call-with-deeplivecam) - Make your meetings livelier by using DeepLiveCam with OBS and meeting software +- [Have a Special Guest!](https://deeplivecam.net/index.php/blog/tips-and-tricks/have-a-special-guest) - Tutorial on how to use face mapping to add special guests to your stream +- [Watch Deepfake Movies in Realtime](https://deeplivecam.net/index.php/blog/tips-and-tricks/watch-deepfake-movies-in-realtime) - See yourself star in any video without processing the video +- [Better Quality without Sacrificing Speed](https://deeplivecam.net/index.php/blog/tips-and-tricks/better-quality-without-sacrificing-speed) - Tips for achieving better results without impacting performance +- [Instant Vtuber!](https://deeplivecam.net/index.php/blog/tips-and-tricks/instant-vtuber) - Create a new persona/vtuber easily using Metahuman Creator + +Visit our [official blog](https://deeplivecam.net/index.php/blog/tips-and-tricks) for more tips and tutorials. 
+ +## Command Line Arguments (Unmaintained) + +``` +options: + -h, --help show this help message and exit + -s SOURCE_PATH, --source SOURCE_PATH select a source image + -t TARGET_PATH, --target TARGET_PATH select a target image or video + -o OUTPUT_PATH, --output OUTPUT_PATH select output file or directory + --frame-processor FRAME_PROCESSOR [FRAME_PROCESSOR ...] frame processors (choices: face_swapper, face_enhancer, ...) + --keep-fps keep original fps + --keep-audio keep original audio + --keep-frames keep temporary frames + --many-faces process every face + --map-faces map source target faces + --mouth-mask mask the mouth region + --video-encoder {libx264,libx265,libvpx-vp9} adjust output video encoder + --video-quality [0-51] adjust output video quality + --live-mirror the live camera display as you see it in the front-facing camera frame + --live-resizable the live camera frame is resizable + --max-memory MAX_MEMORY maximum amount of RAM in GB + --execution-provider {cpu} [{cpu} ...] available execution provider (choices: cpu, ...) + --execution-threads EXECUTION_THREADS number of execution threads + -v, --version show program's version number and exit +``` + +Looking for a CLI mode? Using the -s/--source argument will make the run program in cli mode. 
+ +## Press + +**We are always open to criticism and are ready to improve, that's why we didn't cherry-pick anything.** + + - [*"Deep-Live-Cam goes viral, allowing anyone to become a digital doppelganger"*](https://arstechnica.com/information-technology/2024/08/new-ai-tool-enables-real-time-face-swapping-on-webcams-raising-fraud-concerns/) - Ars Technica + - [*"Thanks Deep Live Cam, shapeshifters are among us now"*](https://dataconomy.com/2024/08/15/what-is-deep-live-cam-github-deepfake/) - Dataconomy + - [*"This free AI tool lets you become anyone during video-calls"*](https://www.newsbytesapp.com/news/science/deep-live-cam-ai-impersonation-tool-goes-viral/story) - NewsBytes + - [*"OK, this viral AI live stream software is truly terrifying"*](https://www.creativebloq.com/ai/ok-this-viral-ai-live-stream-software-is-truly-terrifying) - Creative Bloq + - [*"Deepfake AI Tool Lets You Become Anyone in a Video Call With Single Photo"*](https://petapixel.com/2024/08/14/deep-live-cam-deepfake-ai-tool-lets-you-become-anyone-in-a-video-call-with-single-photo-mark-zuckerberg-jd-vance-elon-musk/) - PetaPixel + - [*"Deep-Live-Cam Uses AI to Transform Your Face in Real-Time, Celebrities Included"*](https://www.techeblog.com/deep-live-cam-ai-transform-face/) - TechEBlog + - [*"An AI tool that "makes you look like anyone" during a video call is going viral online"*](https://telegrafi.com/en/a-tool-that-makes-you-look-like-anyone-during-a-video-call-is-going-viral-on-the-Internet/) - Telegrafi + - [*"This Deepfake Tool Turning Images Into Livestreams is Topping the GitHub Charts"*](https://decrypt.co/244565/this-deepfake-tool-turning-images-into-livestreams-is-topping-the-github-charts) - Emerge + - [*"New Real-Time Face-Swapping AI Allows Anyone to Mimic Famous Faces"*](https://www.digitalmusicnews.com/2024/08/15/face-swapping-ai-real-time-mimic/) - Digital Music News + - [*"This real-time webcam deepfake tool raises alarms about the future of identity 
theft"*](https://www.diyphotography.net/this-real-time-webcam-deepfake-tool-raises-alarms-about-the-future-of-identity-theft/) - DIYPhotography + - [*"That's Crazy, Oh God. That's Fucking Freaky Dude... That's So Wild Dude"*](https://www.youtube.com/watch?time_continue=1074&v=py4Tc-Y8BcY) - SomeOrdinaryGamers + - [*"Alright look look look, now look chat, we can do any face we want to look like chat"*](https://www.youtube.com/live/mFsCe7AIxq8?feature=shared&t=2686) - IShowSpeed + +## Credits + +- [ffmpeg](https://ffmpeg.org/): for making video-related operations easy +- [deepinsight](https://github.com/deepinsight): for their [insightface](https://github.com/deepinsight/insightface) project which provided a well-made library and models. Please be reminded that the [use of the model is for non-commercial research purposes only](https://github.com/deepinsight/insightface?tab=readme-ov-file#license). +- [havok2-htwo](https://github.com/havok2-htwo): for sharing the code for webcam +- [GosuDRM](https://github.com/GosuDRM): for the open version of roop +- [pereiraroland26](https://github.com/pereiraroland26): Multiple faces support +- [vic4key](https://github.com/vic4key): For supporting/contributing to this project +- [kier007](https://github.com/kier007): for improving the user experience +- [qitianai](https://github.com/qitianai): for multi-lingual support +- and [all developers](https://github.com/hacksider/Deep-Live-Cam/graphs/contributors) behind libraries used in this project. +- Footnote: Please be informed that the base author of the code is [s0md3v](https://github.com/s0md3v/roop) +- All the wonderful users who helped make this project go viral by starring the repo ❤️ + +[![Stargazers](https://reporoster.com/stars/hacksider/Deep-Live-Cam)](https://github.com/hacksider/Deep-Live-Cam/stargazers) + +## Contributions + +![Alt](https://repobeats.axiom.co/api/embed/fec8e29c45dfdb9c5916f3a7830e1249308d20e1.svg "Repobeats analytics image") + +## Stars to the Moon 🚀 + +
+ + + + Star History Chart + + + """ + + # Create initial state + initial_state = State( + source_content=sample_source_content + ) + + # Configuration with podcast title + config = RunnableConfig( + configurable={ + "podcast_title": "SurfSense" + } + ) + + # Create 'podcasts' directory if it doesn't exist + os.makedirs("podcasts", exist_ok=True) + + # Run the agent + print("\nRunning Podcaster agent...\n") + + try: + # Execute the graph + final_state = await graph.ainvoke(initial_state, config) + + # Print results + print("\nAgent execution completed successfully!") + print(f"Generated podcast file: {final_state.get('final_podcast_file_path', 'No audio file generated')}") + + # If transcript was generated, show a preview + if final_state.get('podcast_transcript'): + print("\nPodcast transcript preview (first 3 entries):") + for i, entry in enumerate(final_state.get('podcast_transcript')[:3]): + # Handle both dictionary and PodcastTranscriptEntry objects + if hasattr(entry, 'speaker_id'): + speaker_id = entry.speaker_id + dialog = entry.dialog + else: + speaker_id = entry.get('speaker_id', 0) + dialog = entry.get('dialog', '') + + print(f"Speaker {speaker_id}: {dialog[:50]}...") + + except Exception as e: + print(f"\nError running the agent: {str(e)}") + raise + + print("\nTest completed!") + return final_state + + +if __name__ == "__main__": + # Run the test function + final_state = asyncio.run(test_podcaster_agent()) \ No newline at end of file diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 8c457e17b..bdc370ea3 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -1,10 +1,12 @@ import os from pathlib import Path +import shutil from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker from dotenv import load_dotenv from langchain_community.chat_models import ChatLiteLLM from rerankers import Reranker +from litellm import speech # Get the base 
directory of the project BASE_DIR = Path(__file__).resolve().parent.parent.parent @@ -13,8 +15,27 @@ env_file = BASE_DIR / ".env" load_dotenv(env_file) +def is_ffmpeg_installed(): + """ + Check if ffmpeg is installed on the current system. + + Returns: + bool: True if ffmpeg is installed, False otherwise. + """ + return shutil.which("ffmpeg") is not None + + class Config: + # Check if ffmpeg is installed + if not is_ffmpeg_installed(): + import static_ffmpeg + # ffmpeg installed on first call to add_paths(), threadsafe. + static_ffmpeg.add_paths() + # check if ffmpeg is installed again + if not is_ffmpeg_installed(): + raise ValueError("FFmpeg is not installed on the system. Please install it to use the Surfsense Podcaster.") + # Database DATABASE_URL = os.getenv("DATABASE_URL") @@ -61,6 +82,9 @@ class Config: # Firecrawl API Key FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None) + # Litellm TTS Configuration + TTS_SERVICE = os.getenv("TTS_SERVICE") + # Validation Checks # Check embedding dimension if hasattr(embedding_model_instance, 'dimension') and embedding_model_instance.dimension > 2000: diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index c447a74b8..cecf70943 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -21,9 +21,11 @@ dependencies = [ "notion-client>=2.3.0", "pgvector>=0.3.6", "playwright>=1.50.0", + "python-ffmpeg>=2.0.12", "rerankers[flashrank]>=0.7.1", "sentence-transformers>=3.4.1", "slack-sdk>=3.34.0", + "static-ffmpeg>=2.13", "tavily-python>=0.3.2", "unstructured-client>=0.30.0", "unstructured[all-docs]>=0.16.25", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index a5621abda..5f90ed9ae 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -590,6 +590,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/68/1b/e0a87d256e40e8c888847551b20a017a6b98139178505dc7ffb96f04e954/dnspython-2.7.0-py3-none-any.whl", hash = 
"sha256:b4c34b7d10b51bcc3a5071e7b8dee77939f1e878477eeecc965e9835f63c6c86", size = 313632 }, ] +[[package]] +name = "docutils" +version = "0.21.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, +] + [[package]] name = "effdet" version = "0.4.1" @@ -1144,6 +1153,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794 }, ] +[[package]] +name = "id" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/11/102da08f88412d875fa2f1a9a469ff7ad4c874b0ca6fed0048fe385bdb3d/id-1.5.0.tar.gz", hash = "sha256:292cb8a49eacbbdbce97244f47a97b4c62540169c976552e497fd57df0734c1d", size = 15237 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/cb/18326d2d89ad3b0dd143da971e77afd1e6ca6674f1b1c3df4b6bec6279fc/id-1.5.0-py3-none-any.whl", hash = "sha256:f1434e1cef91f2cbb8a4ec64663d5a23b9ed43ef44c4c957d02583d61714c658", size = 13611 }, +] + [[package]] name = "idna" version = "3.10" @@ -1165,6 +1186,48 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/79/9d/0fb148dc4d6fa4a7dd1d8378168d9b4cd8d4560a6fbf6f0121c5fc34eb68/importlib_metadata-8.6.1-py3-none-any.whl", hash = 
"sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e", size = 26971 }, ] +[[package]] +name = "jaraco-classes" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "more-itertools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/c0/ed4a27bc5571b99e3cff68f8a9fa5b56ff7df1c2251cc715a652ddd26402/jaraco.classes-3.4.0.tar.gz", hash = "sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd", size = 11780 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/66/b15ce62552d84bbfcec9a4873ab79d993a1dd4edb922cbfccae192bd5b5f/jaraco.classes-3.4.0-py3-none-any.whl", hash = "sha256:f662826b6bed8cace05e7ff873ce0f9283b5c924470fe664fff1c2f00f581790", size = 6777 }, +] + +[[package]] +name = "jaraco-context" +version = "6.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/ad/f3777b81bf0b6e7bc7514a1656d3e637b2e8e15fab2ce3235730b3e7a4e6/jaraco_context-6.0.1.tar.gz", hash = "sha256:9bae4ea555cf0b14938dc0aee7c9f32ed303aa20a3b73e7dc80111628792d1b3", size = 13912 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/db/0c52c4cf5e4bd9f5d7135ec7669a3a767af21b3a308e1ed3674881e52b62/jaraco.context-6.0.1-py3-none-any.whl", hash = "sha256:f797fc481b490edb305122c9181830a3a5b76d84ef6d1aef2fb9b47ab956f9e4", size = 6825 }, +] + +[[package]] +name = "jaraco-functools" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "more-itertools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ab/23/9894b3df5d0a6eb44611c36aec777823fc2e07740dabbd0b810e19594013/jaraco_functools-4.1.0.tar.gz", hash = "sha256:70f7e0e2ae076498e212562325e805204fc092d7b4c17e0e86c959e249701a9d", size = 19159 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/4f/24b319316142c44283d7540e76c7b5a6dbd5db623abd86bb7b3491c21018/jaraco.functools-4.1.0-py3-none-any.whl", 
hash = "sha256:ad159f13428bc4acbf5541ad6dec511f91573b90fba04df61dafa2a1231cf649", size = 10187 }, +] + +[[package]] +name = "jeepney" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7b/6f/357efd7602486741aa73ffc0617fb310a29b588ed0fd69c2399acbb85b0c/jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732", size = 106758 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/a3/e137168c9c44d18eff0376253da9f1e9234d0239e0ee230d2fee6cea8e55/jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683", size = 49010 }, +] + [[package]] name = "jinja2" version = "3.1.5" @@ -1269,6 +1332,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/0f/8910b19ac0670a0f80ce1008e5e751c4a57e14d2c4c13a482aa6079fa9d6/jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf", size = 18459 }, ] +[[package]] +name = "keyring" +version = "25.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jaraco-classes" }, + { name = "jaraco-context" }, + { name = "jaraco-functools" }, + { name = "jeepney", marker = "sys_platform == 'linux'" }, + { name = "pywin32-ctypes", marker = "sys_platform == 'win32'" }, + { name = "secretstorage", marker = "sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/70/09/d904a6e96f76ff214be59e7aa6ef7190008f52a0ab6689760a98de0bf37d/keyring-25.6.0.tar.gz", hash = "sha256:0b39998aa941431eb3d9b0d4b2460bc773b9df6fed7621c2dfb291a7e0187a66", size = 62750 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d3/32/da7f44bcb1105d3e88a0b74ebdca50c59121d2ddf71c9e34ba47df7f3a56/keyring-25.6.0-py3-none-any.whl", hash = "sha256:552a3f7af126ece7ed5c89753650eec89c7eaae8617d0aa4d9ad2b75111266bd", size = 39085 }, +] + 
[[package]] name = "kiwisolver" version = "1.4.8" @@ -1754,6 +1834,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cd/76/c8575f90f521017597c5e57e3bfef61e3f27d9cb6c741a82a24d72b10a60/model2vec-0.4.1-py3-none-any.whl", hash = "sha256:04a397a17da9b967082b6baa4c494f0be48c89ec4e1a3975b4f290f045238a38", size = 41972 }, ] +[[package]] +name = "more-itertools" +version = "10.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size = 127671 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278 }, +] + [[package]] name = "mpmath" version = "1.3.0" @@ -1829,6 +1918,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263 }, ] +[[package]] +name = "nh3" +version = "0.2.21" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/37/30/2f81466f250eb7f591d4d193930df661c8c23e9056bdc78e365b646054d8/nh3-0.2.21.tar.gz", hash = "sha256:4990e7ee6a55490dbf00d61a6f476c9a3258e31e711e13713b2ea7d6616f670e", size = 16581 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/81/b83775687fcf00e08ade6d4605f0be9c4584cb44c4973d9f27b7456a31c9/nh3-0.2.21-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:fcff321bd60c6c5c9cb4ddf2554e22772bb41ebd93ad88171bbbb6f271255286", size = 1297678 }, + { url = 
"https://files.pythonhosted.org/packages/22/ee/d0ad8fb4b5769f073b2df6807f69a5e57ca9cea504b78809921aef460d20/nh3-0.2.21-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31eedcd7d08b0eae28ba47f43fd33a653b4cdb271d64f1aeda47001618348fde", size = 733774 }, + { url = "https://files.pythonhosted.org/packages/ea/76/b450141e2d384ede43fe53953552f1c6741a499a8c20955ad049555cabc8/nh3-0.2.21-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d426d7be1a2f3d896950fe263332ed1662f6c78525b4520c8e9861f8d7f0d243", size = 760012 }, + { url = "https://files.pythonhosted.org/packages/97/90/1182275db76cd8fbb1f6bf84c770107fafee0cb7da3e66e416bcb9633da2/nh3-0.2.21-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9d67709bc0d7d1f5797b21db26e7a8b3d15d21c9c5f58ccfe48b5328483b685b", size = 923619 }, + { url = "https://files.pythonhosted.org/packages/29/c7/269a7cfbec9693fad8d767c34a755c25ccb8d048fc1dfc7a7d86bc99375c/nh3-0.2.21-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:55823c5ea1f6b267a4fad5de39bc0524d49a47783e1fe094bcf9c537a37df251", size = 1000384 }, + { url = "https://files.pythonhosted.org/packages/68/a9/48479dbf5f49ad93f0badd73fbb48b3d769189f04c6c69b0df261978b009/nh3-0.2.21-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:818f2b6df3763e058efa9e69677b5a92f9bc0acff3295af5ed013da544250d5b", size = 918908 }, + { url = "https://files.pythonhosted.org/packages/d7/da/0279c118f8be2dc306e56819880b19a1cf2379472e3b79fc8eab44e267e3/nh3-0.2.21-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b3b5c58161e08549904ac4abd450dacd94ff648916f7c376ae4b2c0652b98ff9", size = 909180 }, + { url = "https://files.pythonhosted.org/packages/26/16/93309693f8abcb1088ae143a9c8dbcece9c8f7fb297d492d3918340c41f1/nh3-0.2.21-cp313-cp313t-win32.whl", hash = "sha256:637d4a10c834e1b7d9548592c7aad760611415fcd5bd346f77fd8a064309ae6d", size = 532747 }, + { url = 
"https://files.pythonhosted.org/packages/a2/3a/96eb26c56cbb733c0b4a6a907fab8408ddf3ead5d1b065830a8f6a9c3557/nh3-0.2.21-cp313-cp313t-win_amd64.whl", hash = "sha256:713d16686596e556b65e7f8c58328c2df63f1a7abe1277d87625dcbbc012ef82", size = 528908 }, + { url = "https://files.pythonhosted.org/packages/ba/1d/b1ef74121fe325a69601270f276021908392081f4953d50b03cbb38b395f/nh3-0.2.21-cp38-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a772dec5b7b7325780922dd904709f0f5f3a79fbf756de5291c01370f6df0967", size = 1316133 }, + { url = "https://files.pythonhosted.org/packages/b8/f2/2c7f79ce6de55b41e7715f7f59b159fd59f6cdb66223c05b42adaee2b645/nh3-0.2.21-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d002b648592bf3033adfd875a48f09b8ecc000abd7f6a8769ed86b6ccc70c759", size = 758328 }, + { url = "https://files.pythonhosted.org/packages/6d/ad/07bd706fcf2b7979c51b83d8b8def28f413b090cf0cb0035ee6b425e9de5/nh3-0.2.21-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2a5174551f95f2836f2ad6a8074560f261cf9740a48437d6151fd2d4d7d617ab", size = 747020 }, + { url = "https://files.pythonhosted.org/packages/75/99/06a6ba0b8a0d79c3d35496f19accc58199a1fb2dce5e711a31be7e2c1426/nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b8d55ea1fc7ae3633d758a92aafa3505cd3cc5a6e40470c9164d54dff6f96d42", size = 944878 }, + { url = "https://files.pythonhosted.org/packages/79/d4/dc76f5dc50018cdaf161d436449181557373869aacf38a826885192fc587/nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ae319f17cd8960d0612f0f0ddff5a90700fa71926ca800e9028e7851ce44a6f", size = 903460 }, + { url = "https://files.pythonhosted.org/packages/cd/c3/d4f8037b2ab02ebf5a2e8637bd54736ed3d0e6a2869e10341f8d9085f00e/nh3-0.2.21-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:63ca02ac6f27fc80f9894409eb61de2cb20ef0a23740c7e29f9ec827139fa578", size = 839369 }, + { url = 
"https://files.pythonhosted.org/packages/11/a9/1cd3c6964ec51daed7b01ca4686a5c793581bf4492cbd7274b3f544c9abe/nh3-0.2.21-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5f77e62aed5c4acad635239ac1290404c7e940c81abe561fd2af011ff59f585", size = 739036 }, + { url = "https://files.pythonhosted.org/packages/fd/04/bfb3ff08d17a8a96325010ae6c53ba41de6248e63cdb1b88ef6369a6cdfc/nh3-0.2.21-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:087ffadfdcd497658c3adc797258ce0f06be8a537786a7217649fc1c0c60c293", size = 768712 }, + { url = "https://files.pythonhosted.org/packages/9e/aa/cfc0bf545d668b97d9adea4f8b4598667d2b21b725d83396c343ad12bba7/nh3-0.2.21-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ac7006c3abd097790e611fe4646ecb19a8d7f2184b882f6093293b8d9b887431", size = 930559 }, + { url = "https://files.pythonhosted.org/packages/78/9d/6f5369a801d3a1b02e6a9a097d56bcc2f6ef98cffebf03c4bb3850d8e0f0/nh3-0.2.21-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:6141caabe00bbddc869665b35fc56a478eb774a8c1dfd6fba9fe1dfdf29e6efa", size = 1008591 }, + { url = "https://files.pythonhosted.org/packages/a6/df/01b05299f68c69e480edff608248313cbb5dbd7595c5e048abe8972a57f9/nh3-0.2.21-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:20979783526641c81d2f5bfa6ca5ccca3d1e4472474b162c6256745fbfe31cd1", size = 925670 }, + { url = "https://files.pythonhosted.org/packages/3d/79/bdba276f58d15386a3387fe8d54e980fb47557c915f5448d8c6ac6f7ea9b/nh3-0.2.21-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a7ea28cd49293749d67e4fcf326c554c83ec912cd09cd94aa7ec3ab1921c8283", size = 917093 }, + { url = "https://files.pythonhosted.org/packages/e7/d8/c6f977a5cd4011c914fb58f5ae573b071d736187ccab31bfb1d539f4af9f/nh3-0.2.21-cp38-abi3-win32.whl", hash = "sha256:6c9c30b8b0d291a7c5ab0967ab200598ba33208f754f2f4920e9343bdd88f79a", size = 537623 }, + { url = 
"https://files.pythonhosted.org/packages/23/fc/8ce756c032c70ae3dd1d48a3552577a325475af2a2f629604b44f571165c/nh3-0.2.21-cp38-abi3-win_amd64.whl", hash = "sha256:bb0014948f04d7976aabae43fcd4cb7f551f9f8ce785a4c9ef66e6c2590f8629", size = 535283 }, +] + [[package]] name = "nltk" version = "3.9.1" @@ -2366,6 +2486,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/2b/e944e10c9b18e77e43d3bb4d6faa323f6cc27597db37b75bc3fd796adfd5/playwright-1.50.0-py3-none-win_amd64.whl", hash = "sha256:1859423da82de631704d5e3d88602d755462b0906824c1debe140979397d2e8d", size = 34784546 }, ] +[[package]] +name = "progress" +version = "1.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/68/d8412d1e0d70edf9791cbac5426dc859f4649afc22f2abbeb0d947cf70fd/progress-1.6.tar.gz", hash = "sha256:c9c86e98b5c03fa1fe11e3b67c1feda4788b8d0fe7336c2ff7d5644ccfba34cd", size = 7842 } + [[package]] name = "propcache" version = "0.2.1" @@ -2705,6 +2831,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863 }, ] +[[package]] +name = "python-ffmpeg" +version = "2.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyee" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/4d/7ecffb341d646e016be76e36f5a42cb32f409c9ca21a57b68f067fad3fc7/python_ffmpeg-2.0.12.tar.gz", hash = "sha256:19ac80af5a064a2f53c245af1a909b2d7648ea045500d96d3bcd507b88d43dc7", size = 14126292 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/6d/02e817aec661defe148cb9eb0c4eca2444846305f625c2243fb9f92a9045/python_ffmpeg-2.0.12-py3-none-any.whl", hash = "sha256:d86697da8dfb39335183e336d31baf42fb217468adf5ac97fd743898240faae3", size = 14411 }, +] + [[package]] 
name = "python-iso639" version = "2025.2.18" @@ -2770,6 +2909,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57", size = 507930 }, ] +[[package]] +name = "pywin32-ctypes" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/9f/01a1a99704853cb63f253eea009390c88e7131c67e66a0a02099a8c917cb/pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755", size = 29471 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/3d/8161f7711c017e01ac9f008dfddd9410dff3674334c233bde66e7ba65bbf/pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8", size = 30756 }, +] + [[package]] name = "pyyaml" version = "6.0.2" @@ -2834,6 +2982,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/43/ca3d1018b392f49131843648e10b08ace23afe8dad3bee5f136e4346b7cd/rapidfuzz-3.12.2-cp313-cp313-win_arm64.whl", hash = "sha256:69f6ecdf1452139f2b947d0c169a605de578efdb72cbb2373cb0a94edca1fd34", size = 863535 }, ] +[[package]] +name = "readme-renderer" +version = "44.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docutils" }, + { name = "nh3" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5a/a9/104ec9234c8448c4379768221ea6df01260cd6c2ce13182d4eac531c8342/readme_renderer-44.0.tar.gz", hash = "sha256:8712034eabbfa6805cacf1402b4eeb2a73028f72d1166d6f5cb7f9c047c5d1e1", size = 32056 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/67/921ec3024056483db83953ae8e48079ad62b92db7880013ca77632921dd0/readme_renderer-44.0-py3-none-any.whl", hash = 
"sha256:2fbca89b81a08526aadf1357a8c2ae889ec05fb03f5da67f9769c9a592166151", size = 13310 }, +] + [[package]] name = "referencing" version = "0.36.2" @@ -2927,6 +3089,15 @@ flashrank = [ { name = "flashrank" }, ] +[[package]] +name = "rfc3986" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/85/40/1520d68bfa07ab5a6f065a186815fb6610c86fe957bc065754e47f7b0840/rfc3986-2.0.0.tar.gz", hash = "sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c", size = 49026 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/9a/9afaade874b2fa6c752c36f1548f718b5b83af81ed9b76628329dab81c1b/rfc3986-2.0.0-py2.py3-none-any.whl", hash = "sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd", size = 31326 }, +] + [[package]] name = "rich" version = "14.0.0" @@ -3083,6 +3254,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e4/1f/5d46a8d94e9f6d2c913cbb109e57e7eed914de38ea99e2c4d69a9fc93140/scipy-1.15.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bc7136626261ac1ed988dca56cfc4ab5180f75e0ee52e58f1e6aa74b5f3eacd5", size = 43181730 }, ] +[[package]] +name = "secretstorage" +version = "3.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography", marker = "sys_platform != 'darwin'" }, + { name = "jeepney", marker = "sys_platform != 'darwin'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/53/a4/f48c9d79cb507ed1373477dbceaba7401fd8a23af63b837fa61f1dcd3691/SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77", size = 19739 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/24/b4293291fa1dd830f353d2cb163295742fa87f179fcc8a20a306a81978b7/SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99", size = 15221 }, +] + [[package]] name = "sentence-transformers" version = 
"3.4.1" @@ -3192,6 +3376,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d9/61/f2b52e107b1fc8944b33ef56bf6ac4ebbe16d91b94d2b87ce013bf63fb84/starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d", size = 71507 }, ] +[[package]] +name = "static-ffmpeg" +version = "2.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "progress" }, + { name = "requests" }, + { name = "twine" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/09/39/1a5d0603280dd681ec52a2a6717c05dab530190dff7887b7603740a1741b/static_ffmpeg-2.13-py3-none-any.whl", hash = "sha256:3bed55a7979f9de9d1eec1126b98774a1d41c2e323811f59973d54b9c94d6dac", size = 7586 }, +] + [[package]] name = "surf-new-backend" version = "0.0.6" @@ -3213,9 +3411,11 @@ dependencies = [ { name = "notion-client" }, { name = "pgvector" }, { name = "playwright" }, + { name = "python-ffmpeg" }, { name = "rerankers", extra = ["flashrank"] }, { name = "sentence-transformers" }, { name = "slack-sdk" }, + { name = "static-ffmpeg" }, { name = "tavily-python" }, { name = "unstructured", extra = ["all-docs"] }, { name = "unstructured-client" }, @@ -3242,9 +3442,11 @@ requires-dist = [ { name = "notion-client", specifier = ">=2.3.0" }, { name = "pgvector", specifier = ">=0.3.6" }, { name = "playwright", specifier = ">=1.50.0" }, + { name = "python-ffmpeg", specifier = ">=2.0.12" }, { name = "rerankers", extras = ["flashrank"], specifier = ">=0.7.1" }, { name = "sentence-transformers", specifier = ">=3.4.1" }, { name = "slack-sdk", specifier = ">=3.34.0" }, + { name = "static-ffmpeg", specifier = ">=2.13" }, { name = "tavily-python", specifier = ">=0.3.2" }, { name = "unstructured", extras = ["all-docs"], specifier = ">=0.16.25" }, { name = "unstructured-client", specifier = ">=0.30.0" }, @@ -3549,6 +3751,26 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/c7/30/37a3384d1e2e9320331baca41e835e90a3767303642c7a80d4510152cbcf/triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0", size = 253154278 }, ] +[[package]] +name = "twine" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "id" }, + { name = "keyring", marker = "platform_machine != 'ppc64le' and platform_machine != 's390x'" }, + { name = "packaging" }, + { name = "readme-renderer" }, + { name = "requests" }, + { name = "requests-toolbelt" }, + { name = "rfc3986" }, + { name = "rich" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c8/a2/6df94fc5c8e2170d21d7134a565c3a8fb84f9797c1dd65a5976aaf714418/twine-6.1.0.tar.gz", hash = "sha256:be324f6272eff91d07ee93f251edf232fc647935dd585ac003539b42404a8dbd", size = 168404 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/b6/74e927715a285743351233f33ea3c684528a0d374d2e43ff9ce9585b73fe/twine-6.1.0-py3-none-any.whl", hash = "sha256:a47f973caf122930bf0fbbf17f80b83bc1602c9ce393c7845f289a3001dc5384", size = 40791 }, +] + [[package]] name = "types-requests" version = "2.32.0.20250328" From b4bee887bdb95593bede649447a450e3843c4b56 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Mon, 5 May 2025 23:18:12 -0700 Subject: [PATCH 12/70] feat: Added Podcast Feature and its actually fast. 
- Fully Async --- .../6_change_podcast_content_to_transcript.py | 44 + .../versions/7_remove_is_generated_column.py | 28 + .../app/agents/podcaster/graph.py | 32 +- .../app/agents/podcaster/nodes.py | 2 +- .../app/agents/podcaster/prompts.py | 2 +- surfsense_backend/app/db.py | 3 +- .../app/routes/podcasts_routes.py | 129 ++- surfsense_backend/app/schemas/__init__.py | 3 +- surfsense_backend/app/schemas/chats.py | 8 +- surfsense_backend/app/schemas/podcasts.py | 12 +- surfsense_backend/app/tasks/podcast_tasks.py | 94 +++ .../[search_space_id]/chats/chats-client.tsx | 447 ++++++++-- .../dashboard/[search_space_id]/layout.tsx | 7 + .../[search_space_id]/podcasts/page.tsx | 22 + .../podcasts/podcasts-client.tsx | 787 ++++++++++++++++++ .../components/sidebar/app-sidebar.tsx | 4 +- surfsense_web/components/ui/slider.tsx | 28 + surfsense_web/package.json | 1 + surfsense_web/pnpm-lock.yaml | 98 +++ 19 files changed, 1676 insertions(+), 75 deletions(-) create mode 100644 surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py create mode 100644 surfsense_backend/alembic/versions/7_remove_is_generated_column.py create mode 100644 surfsense_backend/app/tasks/podcast_tasks.py create mode 100644 surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx create mode 100644 surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx create mode 100644 surfsense_web/components/ui/slider.tsx diff --git a/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py new file mode 100644 index 000000000..991948f3a --- /dev/null +++ b/surfsense_backend/alembic/versions/6_change_podcast_content_to_transcript.py @@ -0,0 +1,44 @@ +"""Change podcast_content to podcast_transcript with JSON type + +Revision ID: 6 +Revises: 5 +Create Date: 2023-08-15 00:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as 
sa +from sqlalchemy.dialects.postgresql import JSON + + +# revision identifiers, used by Alembic. +revision: str = '6' +down_revision: Union[str, None] = '5' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Drop the old column and create a new one with the new name and type + # We need to do this because PostgreSQL doesn't support direct column renames with type changes + op.add_column('podcasts', sa.Column('podcast_transcript', JSON, nullable=False, server_default='{}')) + + # Copy data from old column to new column + # Convert text to JSON by storing it as a JSON string value + op.execute("UPDATE podcasts SET podcast_transcript = jsonb_build_object('text', podcast_content) WHERE podcast_content != ''") + + # Drop the old column + op.drop_column('podcasts', 'podcast_content') + + +def downgrade() -> None: + # Add back the original column + op.add_column('podcasts', sa.Column('podcast_content', sa.Text(), nullable=False, server_default='')) + + # Copy data from JSON column back to text column + # Extract the 'text' field if it exists, otherwise use empty string + op.execute("UPDATE podcasts SET podcast_content = COALESCE((podcast_transcript->>'text'), '')") + + # Drop the new column + op.drop_column('podcasts', 'podcast_transcript') \ No newline at end of file diff --git a/surfsense_backend/alembic/versions/7_remove_is_generated_column.py b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py new file mode 100644 index 000000000..c5d25ad70 --- /dev/null +++ b/surfsense_backend/alembic/versions/7_remove_is_generated_column.py @@ -0,0 +1,28 @@ +"""Remove is_generated column from podcasts table + +Revision ID: 7 +Revises: 6 +Create Date: 2023-08-15 01:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '7' +down_revision: Union[str, None] = '6' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Drop the is_generated column + op.drop_column('podcasts', 'is_generated') + + +def downgrade() -> None: + # Add back the is_generated column with its original constraints + op.add_column('podcasts', sa.Column('is_generated', sa.Boolean(), nullable=False, server_default='false')) \ No newline at end of file diff --git a/surfsense_backend/app/agents/podcaster/graph.py b/surfsense_backend/app/agents/podcaster/graph.py index f4604a7c8..d102432ef 100644 --- a/surfsense_backend/app/agents/podcaster/graph.py +++ b/surfsense_backend/app/agents/podcaster/graph.py @@ -6,18 +6,26 @@ from .state import State from .nodes import create_merged_podcast_audio, create_podcast_transcript -# Define a new graph -workflow = StateGraph(State, config_schema=Configuration) -# Add the node to the graph -workflow.add_node("create_podcast_transcript", create_podcast_transcript) -workflow.add_node("create_merged_podcast_audio", create_merged_podcast_audio) +def build_graph(): + + # Define a new graph + workflow = StateGraph(State, config_schema=Configuration) -# Set the entrypoint as `call_model` -workflow.add_edge("__start__", "create_podcast_transcript") -workflow.add_edge("create_podcast_transcript", "create_merged_podcast_audio") -workflow.add_edge("create_merged_podcast_audio", "__end__") + # Add the node to the graph + workflow.add_node("create_podcast_transcript", create_podcast_transcript) + workflow.add_node("create_merged_podcast_audio", create_merged_podcast_audio) -# Compile the workflow into an executable graph -graph = workflow.compile() -graph.name = "Surfsense Podcaster" # This defines the custom name in LangSmith + # Set the entrypoint as `call_model` + workflow.add_edge("__start__", "create_podcast_transcript") + workflow.add_edge("create_podcast_transcript", 
"create_merged_podcast_audio") + workflow.add_edge("create_merged_podcast_audio", "__end__") + + # Compile the workflow into an executable graph + graph = workflow.compile() + graph.name = "Surfsense Podcaster" # This defines the custom name in LangSmith + + return graph + +# Compile the graph once when the module is loaded +graph = build_graph() diff --git a/surfsense_backend/app/agents/podcaster/nodes.py b/surfsense_backend/app/agents/podcaster/nodes.py index 810307ec2..19a233a6c 100644 --- a/surfsense_backend/app/agents/podcaster/nodes.py +++ b/surfsense_backend/app/agents/podcaster/nodes.py @@ -28,7 +28,7 @@ async def create_podcast_transcript(state: State, config: RunnableConfig) -> Dic # Create the messages messages = [ SystemMessage(content=prompt), - HumanMessage(content=state.source_content) + HumanMessage(content=f"{state.source_content}") ] # Generate the podcast transcript diff --git a/surfsense_backend/app/agents/podcaster/prompts.py b/surfsense_backend/app/agents/podcaster/prompts.py index 2b4bdcfec..c08d38e31 100644 --- a/surfsense_backend/app/agents/podcaster/prompts.py +++ b/surfsense_backend/app/agents/podcaster/prompts.py @@ -106,6 +106,6 @@ Output: }} -Transform the source material into a lively and engaging podcast conversation. Craft dialogue that showcases authentic host chemistry and natural interaction (including occasional disagreement, building on points, or asking follow-up questions). Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates *and* entertains the listener while keeping within a 3-minute audio duration. +Transform the source material into a lively and engaging podcast conversation. Craft dialogue that showcases authentic host chemistry and natural interaction (including occasional disagreement, building on points, or asking follow-up questions). 
Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates *and* entertains the listener while keeping within a 5-minute audio duration. """ \ No newline at end of file diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index b4ee3e790..7327c3a0c 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -110,8 +110,7 @@ class Podcast(BaseModel, TimestampMixin): __tablename__ = "podcasts" title = Column(String, nullable=False, index=True) - is_generated = Column(Boolean, nullable=False, default=False) - podcast_content = Column(Text, nullable=False, default="") + podcast_transcript = Column(JSON, nullable=False, default={}) file_location = Column(String(500), nullable=False, default="") search_space_id = Column(Integer, ForeignKey("searchspaces.id", ondelete='CASCADE'), nullable=False) diff --git a/surfsense_backend/app/routes/podcasts_routes.py b/surfsense_backend/app/routes/podcasts_routes.py index 7ac1da1ba..bc82e21d0 100644 --- a/surfsense_backend/app/routes/podcasts_routes.py +++ b/surfsense_backend/app/routes/podcasts_routes.py @@ -1,12 +1,16 @@ -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.exc import IntegrityError, SQLAlchemyError from typing import List -from app.db import get_async_session, User, SearchSpace, Podcast -from app.schemas import PodcastCreate, PodcastUpdate, PodcastRead +from app.db import get_async_session, User, SearchSpace, Podcast, Chat +from app.schemas import PodcastCreate, PodcastUpdate, PodcastRead, PodcastGenerateRequest from app.users import current_active_user from app.utils.check_ownership import check_ownership +from app.tasks.podcast_tasks import generate_chat_podcast +from fastapi.responses import StreamingResponse +import os +from pathlib import Path router = 
APIRouter() @@ -119,4 +123,121 @@ async def delete_podcast( raise he except SQLAlchemyError: await session.rollback() - raise HTTPException(status_code=500, detail="Database error occurred while deleting podcast") \ No newline at end of file + raise HTTPException(status_code=500, detail="Database error occurred while deleting podcast") + +async def generate_chat_podcast_with_new_session( + chat_id: int, + search_space_id: int, + podcast_title: str = "SurfSense Podcast" +): + """Create a new session and process chat podcast generation.""" + from app.db import async_session_maker + + async with async_session_maker() as session: + try: + await generate_chat_podcast(session, chat_id, search_space_id, podcast_title) + except Exception as e: + import logging + logging.error(f"Error generating podcast from chat: {str(e)}") + +@router.post("/podcasts/generate/") +async def generate_podcast( + request: PodcastGenerateRequest, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), + fastapi_background_tasks: BackgroundTasks = BackgroundTasks() +): + try: + # Check if the user owns the search space + await check_ownership(session, SearchSpace, request.search_space_id, user) + + if request.type == "CHAT": + # Verify that all chat IDs belong to this user and search space + query = select(Chat).filter( + Chat.id.in_(request.ids), + Chat.search_space_id == request.search_space_id + ).join(SearchSpace).filter(SearchSpace.user_id == user.id) + + result = await session.execute(query) + valid_chats = result.scalars().all() + valid_chat_ids = [chat.id for chat in valid_chats] + + # If any requested ID is not in valid IDs, raise error immediately + if len(valid_chat_ids) != len(request.ids): + raise HTTPException( + status_code=403, + detail="One or more chat IDs do not belong to this user or search space" + ) + + # Queue a separate podcast generation task for each validated chat ID + for chat_id in valid_chat_ids: + fastapi_background_tasks.add_task( + 
generate_chat_podcast_with_new_session, + chat_id, + request.search_space_id, + request.podcast_title + ) + + return { + "message": "Podcast generation started", + } + except HTTPException as he: + raise he + except IntegrityError as e: + await session.rollback() + raise HTTPException(status_code=400, detail="Podcast generation failed due to constraint violation") + except SQLAlchemyError as e: + await session.rollback() + raise HTTPException(status_code=500, detail="Database error occurred while generating podcast") + except Exception as e: + await session.rollback() + raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}") + +@router.get("/podcasts/{podcast_id}/stream") +async def stream_podcast( + podcast_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user) +): + """Stream a podcast audio file.""" + try: + # Get the podcast and check if user has access + result = await session.execute( + select(Podcast) + .join(SearchSpace) + .filter(Podcast.id == podcast_id, SearchSpace.user_id == user.id) + ) + podcast = result.scalars().first() + + if not podcast: + raise HTTPException( + status_code=404, + detail="Podcast not found or you don't have permission to access it" + ) + + # Get the file path + file_path = podcast.file_location + + # Check if the file exists + if not os.path.isfile(file_path): + raise HTTPException(status_code=404, detail="Podcast audio file not found") + + # Define a generator function to stream the file + def iterfile(): + with open(file_path, mode="rb") as file_like: + yield from file_like + + # Return a streaming response with appropriate headers + return StreamingResponse( + iterfile(), + media_type="audio/mpeg", + headers={ + "Accept-Ranges": "bytes", + "Content-Disposition": f"inline; filename={Path(file_path).name}" + } + ) + + except HTTPException as he: + raise he + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error streaming 
podcast: {str(e)}") \ No newline at end of file diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py index 07adf24de..21688dfb0 100644 --- a/surfsense_backend/app/schemas/__init__.py +++ b/surfsense_backend/app/schemas/__init__.py @@ -10,7 +10,7 @@ from .documents import ( DocumentRead, ) from .chunks import ChunkBase, ChunkCreate, ChunkUpdate, ChunkRead -from .podcasts import PodcastBase, PodcastCreate, PodcastUpdate, PodcastRead +from .podcasts import PodcastBase, PodcastCreate, PodcastUpdate, PodcastRead, PodcastGenerateRequest from .chats import ChatBase, ChatCreate, ChatUpdate, ChatRead, AISDKChatRequest from .search_source_connector import SearchSourceConnectorBase, SearchSourceConnectorCreate, SearchSourceConnectorUpdate, SearchSourceConnectorRead @@ -39,6 +39,7 @@ __all__ = [ "PodcastCreate", "PodcastUpdate", "PodcastRead", + "PodcastGenerateRequest", "ChatBase", "ChatCreate", "ChatUpdate", diff --git a/surfsense_backend/app/schemas/chats.py b/surfsense_backend/app/schemas/chats.py index ad7829b26..f5eefc532 100644 --- a/surfsense_backend/app/schemas/chats.py +++ b/surfsense_backend/app/schemas/chats.py @@ -1,8 +1,10 @@ from typing import Any, Dict, List, Optional -from pydantic import BaseModel -from sqlalchemy import JSON -from .base import IDModel, TimestampModel + from app.db import ChatType +from pydantic import BaseModel + +from .base import IDModel, TimestampModel + class ChatBase(BaseModel): type: ChatType diff --git a/surfsense_backend/app/schemas/podcasts.py b/surfsense_backend/app/schemas/podcasts.py index fbec5482b..4132fb211 100644 --- a/surfsense_backend/app/schemas/podcasts.py +++ b/surfsense_backend/app/schemas/podcasts.py @@ -1,10 +1,10 @@ from pydantic import BaseModel +from typing import Any, List, Literal from .base import IDModel, TimestampModel class PodcastBase(BaseModel): title: str - is_generated: bool = False - podcast_content: str = "" + podcast_transcript: List[Any] file_location: str 
= "" search_space_id: int @@ -16,4 +16,10 @@ class PodcastUpdate(PodcastBase): class PodcastRead(PodcastBase, IDModel, TimestampModel): class Config: - from_attributes = True \ No newline at end of file + from_attributes = True + +class PodcastGenerateRequest(BaseModel): + type: Literal["DOCUMENT", "CHAT"] + ids: List[int] + search_space_id: int + podcast_title: str = "SurfSense Podcast" \ No newline at end of file diff --git a/surfsense_backend/app/tasks/podcast_tasks.py b/surfsense_backend/app/tasks/podcast_tasks.py new file mode 100644 index 000000000..e148f5465 --- /dev/null +++ b/surfsense_backend/app/tasks/podcast_tasks.py @@ -0,0 +1,94 @@ +from sqlalchemy.ext.asyncio import AsyncSession +from app.schemas import PodcastGenerateRequest +from typing import List +from sqlalchemy import select +from app.db import Chat, Podcast +from app.agents.podcaster.graph import graph as podcaster_graph +from app.agents.podcaster.state import State + + +async def generate_document_podcast( + session: AsyncSession, + document_id: int, + search_space_id: int, + user_id: int +): + # TODO: Need to fetch the document chunks, then concatenate them and pass them to the podcast generation model + pass + + + +async def generate_chat_podcast( + session: AsyncSession, + chat_id: int, + search_space_id: int, + podcast_title: str +): + # Fetch the chat with the specified ID + query = select(Chat).filter( + Chat.id == chat_id, + Chat.search_space_id == search_space_id + ) + + result = await session.execute(query) + chat = result.scalars().first() + + if not chat: + raise ValueError(f"Chat with id {chat_id} not found in search space {search_space_id}") + + # Create chat history structure + chat_history_str = "" + + for message in chat.messages: + if message["role"] == "user": + chat_history_str += f"{message['content']}" + elif message["role"] == "assistant": + # Last annotation type will always be "ANSWER" here + answer_annotation = message["annotations"][-1] + 
answer_text = "" + if answer_annotation["type"] == "ANSWER": + answer_text = answer_annotation["content"] + # If content is a list, join it into a single string + if isinstance(answer_text, list): + answer_text = "\n".join(answer_text) + chat_history_str += f"{answer_text}" + + chat_history_str += "" + + # Pass it to the SurfSense Podcaster + config = { + "configurable": { + "podcast_title" : "Surfsense", + } + } + # Initialize state with database session and streaming service + initial_state = State( + source_content=chat_history_str, + ) + + # Run the graph directly + result = await podcaster_graph.ainvoke(initial_state, config=config) + + # Convert podcast transcript entries to serializable format + serializable_transcript = [] + for entry in result["podcast_transcript"]: + serializable_transcript.append({ + "speaker_id": entry.speaker_id, + "dialog": entry.dialog + }) + + # Create a new podcast entry + podcast = Podcast( + title=f"{podcast_title}", + podcast_transcript=serializable_transcript, + file_location=result["final_podcast_file_path"], + search_space_id=search_space_id + ) + + # Add to session and commit + session.add(podcast) + await session.commit() + await session.refresh(podcast) + + return podcast + diff --git a/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx b/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx index c481bd6ec..6501ca684 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/chats/chats-client.tsx @@ -3,7 +3,7 @@ import { useState, useEffect } from 'react'; import { motion, AnimatePresence } from 'framer-motion'; import { useSearchParams } from 'next/navigation'; -import { MessageCircleMore, Search, Calendar, Tag, Trash2, ExternalLink, MoreHorizontal } from 'lucide-react'; +import { MessageCircleMore, Search, Calendar, Tag, Trash2, ExternalLink, MoreHorizontal, Radio, CheckCircle, Circle, Podcast } from 
'lucide-react'; import { format } from 'date-fns'; // UI Components @@ -42,6 +42,9 @@ import { SelectTrigger, SelectValue, } from "@/components/ui/select"; +import { Checkbox } from "@/components/ui/checkbox"; +import { Label } from "@/components/ui/label"; +import { toast } from "sonner"; interface Chat { created_at: string; @@ -92,6 +95,18 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) const [chatToDelete, setChatToDelete] = useState<{ id: number, title: string } | null>(null); const [isDeleting, setIsDeleting] = useState(false); + // New state for podcast generation + const [selectedChats, setSelectedChats] = useState([]); + const [selectionMode, setSelectionMode] = useState(false); + const [podcastDialogOpen, setPodcastDialogOpen] = useState(false); + const [podcastTitle, setPodcastTitle] = useState(""); + const [isGeneratingPodcast, setIsGeneratingPodcast] = useState(false); + + // New state for individual podcast generation + const [currentChatIndex, setCurrentChatIndex] = useState(0); + const [podcastTitles, setPodcastTitles] = useState<{[key: number]: string}>({}); + const [processingChat, setProcessingChat] = useState(null); + const chatsPerPage = 9; const searchParams = useSearchParams(); @@ -234,6 +249,177 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) // Get unique chat types for filter dropdown const chatTypes = ['all', ...Array.from(new Set(chats.map(chat => chat.type)))]; + // Generate individual podcasts from selected chats + const handleGeneratePodcast = async () => { + if (selectedChats.length === 0) { + toast.error("Please select at least one chat"); + return; + } + + const currentChatId = selectedChats[currentChatIndex]; + const currentTitle = podcastTitles[currentChatId] || podcastTitle; + + if (!currentTitle.trim()) { + toast.error("Please enter a podcast title"); + return; + } + + setIsGeneratingPodcast(true); + try { + const token = 
localStorage.getItem('surfsense_bearer_token'); + if (!token) { + toast.error("Authentication error. Please log in again."); + setIsGeneratingPodcast(false); + return; + } + + // Create payload for single chat + const payload = { + type: "CHAT", + ids: [currentChatId], // Single chat ID + search_space_id: parseInt(searchSpaceId), + podcast_title: currentTitle + }; + + const response = await fetch(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/generate/`, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(payload) + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error(errorData.detail || "Failed to generate podcast"); + } + + const data = await response.json(); + toast.success(`Podcast "${currentTitle}" generation started!`); + + // Move to the next chat or finish + if (currentChatIndex < selectedChats.length - 1) { + // Set up for next chat + setCurrentChatIndex(currentChatIndex + 1); + + // Find the next chat from the chats array + const nextChatId = selectedChats[currentChatIndex + 1]; + const nextChat = chats.find(chat => chat.id === nextChatId) || null; + setProcessingChat(nextChat); + + // Default title for the next chat + if (!podcastTitles[nextChatId]) { + setPodcastTitle(nextChat?.title || `Podcast from Chat ${nextChatId}`); + } else { + setPodcastTitle(podcastTitles[nextChatId]); + } + + setIsGeneratingPodcast(false); + } else { + // All done + finishPodcastGeneration(); + } + } catch (error) { + console.error('Error generating podcast:', error); + toast.error(error instanceof Error ? error.message : 'Failed to generate podcast'); + setIsGeneratingPodcast(false); + } + }; + + // Helper to finish the podcast generation process + const finishPodcastGeneration = () => { + toast.success("All podcasts are being generated! 
Check the podcasts tab to see them when ready."); + setPodcastDialogOpen(false); + setSelectedChats([]); + setSelectionMode(false); + setCurrentChatIndex(0); + setPodcastTitles({}); + setProcessingChat(null); + setPodcastTitle(""); + setIsGeneratingPodcast(false); + }; + + // Start podcast generation flow + const startPodcastGeneration = () => { + if (selectedChats.length === 0) { + toast.error("Please select at least one chat"); + return; + } + + // Reset the state for podcast generation + setCurrentChatIndex(0); + setPodcastTitles({}); + + // Set up for the first chat + const firstChatId = selectedChats[0]; + const firstChat = chats.find(chat => chat.id === firstChatId) || null; + setProcessingChat(firstChat); + + // Set default title for the first chat + setPodcastTitle(firstChat?.title || `Podcast from Chat ${firstChatId}`); + setPodcastDialogOpen(true); + }; + + // Update the title for the current chat + const updateCurrentChatTitle = (title: string) => { + const currentChatId = selectedChats[currentChatIndex]; + setPodcastTitle(title); + setPodcastTitles(prev => ({ + ...prev, + [currentChatId]: title + })); + }; + + // Skip generating a podcast for the current chat + const skipCurrentChat = () => { + if (currentChatIndex < selectedChats.length - 1) { + // Move to the next chat + setCurrentChatIndex(currentChatIndex + 1); + + // Find the next chat + const nextChatId = selectedChats[currentChatIndex + 1]; + const nextChat = chats.find(chat => chat.id === nextChatId) || null; + setProcessingChat(nextChat); + + // Set default title for the next chat + if (!podcastTitles[nextChatId]) { + setPodcastTitle(nextChat?.title || `Podcast from Chat ${nextChatId}`); + } else { + setPodcastTitle(podcastTitles[nextChatId]); + } + } else { + // All done (all skipped) + finishPodcastGeneration(); + } + }; + + // Toggle chat selection + const toggleChatSelection = (chatId: number) => { + setSelectedChats(prev => + prev.includes(chatId) + ? 
prev.filter(id => id !== chatId) + : [...prev, chatId] + ); + }; + + // Select all visible chats + const selectAllVisibleChats = () => { + const visibleChatIds = currentChats.map(chat => chat.id); + setSelectedChats(prev => { + const allSelected = visibleChatIds.every(id => prev.includes(id)); + return allSelected + ? prev.filter(id => !visibleChatIds.includes(id)) // Deselect all visible if all are selected + : [...new Set([...prev, ...visibleChatIds])]; // Add all visible, ensuring no duplicates + }); + }; + + // Cancel selection mode + const cancelSelectionMode = () => { + setSelectionMode(false); + setSelectedChats([]); + }; + return (
-
- +
+ {selectionMode ? ( + <> + + + + + ) : ( + <> + + + + )}
@@ -334,44 +564,69 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) animate="animate" exit="exit" transition={{ duration: 0.2, delay: index * 0.05 }} - className="overflow-hidden hover:shadow-md transition-shadow" + className={`overflow-hidden hover:shadow-md transition-shadow + ${selectionMode && selectedChats.includes(chat.id) + ? 'ring-2 ring-primary ring-offset-2' : ''}`} + onClick={() => selectionMode ? toggleChatSelection(chat.id) : null} >
-
- {chat.title || `Chat ${chat.id}`} - - - - {format(new Date(chat.created_at), 'MMM d, yyyy')} - - +
+ {selectionMode && ( +
+ {selectedChats.includes(chat.id) + ? + : } +
+ )} +
+ {chat.title || `Chat ${chat.id}`} + + + + {format(new Date(chat.created_at), 'MMM d, yyyy')} + + +
- - - - - - window.location.href = `/dashboard/${chat.search_space_id}/researcher/${chat.id}`}> - - View Chat - - - { - setChatToDelete({ id: chat.id, title: chat.title || `Chat ${chat.id}` }); - setDeleteDialogOpen(true); - }} - > - - Delete Chat - - - + {!selectionMode && ( + + + + + + window.location.href = `/dashboard/${chat.search_space_id}/researcher/${chat.id}`}> + + View Chat + + { + setSelectedChats([chat.id]); + setPodcastTitle(chat.title || `Chat ${chat.id}`); + setPodcastDialogOpen(true); + }} + > + + Generate Podcast + + + { + e.stopPropagation(); + setChatToDelete({ id: chat.id, title: chat.title || `Chat ${chat.id}` }); + setDeleteDialogOpen(true); + }} + > + + Delete Chat + + + + )}
@@ -505,6 +760,104 @@ export default function ChatsPageClient({ searchSpaceId }: ChatsPageClientProps) + + {/* Podcast Generation Dialog */} + { + if (!isOpen) { + // Cancel the process if dialog is closed + setPodcastDialogOpen(false); + setSelectedChats([]); + setSelectionMode(false); + setCurrentChatIndex(0); + setPodcastTitles({}); + setProcessingChat(null); + setPodcastTitle(""); + } else { + setPodcastDialogOpen(true); + } + }} + > + + + + + Generate Podcast {currentChatIndex + 1} of {selectedChats.length} + + + {selectedChats.length > 1 ? ( + <>Creating individual podcasts for each selected chat. Currently processing: {processingChat?.title || `Chat ${selectedChats[currentChatIndex]}`} + ) : ( + <>Create a podcast from this chat. The podcast will be available in the podcasts section once generated. + )} + + + +
+
+ + updateCurrentChatTitle(e.target.value)} + /> +
+ + {selectedChats.length > 1 && ( +
+
+
+ )} +
+ + + {selectedChats.length > 1 && !isGeneratingPodcast && ( + + )} + + + +
+
); } \ No newline at end of file diff --git a/surfsense_web/app/dashboard/[search_space_id]/layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/layout.tsx index 7449e10b5..a3c344aaf 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/layout.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/layout.tsx @@ -73,6 +73,13 @@ export default function DashboardLayout({ }, ], }, + { + title: "Podcasts", + url: `/dashboard/${search_space_id}/podcasts`, + icon: "Podcast", + items: [ + ], + } // TODO: Add research synthesizer's // { // title: "Research Synthesizer's", diff --git a/surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx new file mode 100644 index 000000000..394177c88 --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/podcasts/page.tsx @@ -0,0 +1,22 @@ +import { Suspense } from 'react'; +import PodcastsPageClient from './podcasts-client'; + +interface PageProps { + params: { + search_space_id: string; + }; +} + +export default async function PodcastsPage({ params }: PageProps) { + // Access dynamic route parameters + // Need to await params before accessing its properties in an async component + const { search_space_id: searchSpaceId } = await Promise.resolve(params); + + return ( + +
+
}> + + + ); +} diff --git a/surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx b/surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx new file mode 100644 index 000000000..cacee7061 --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/podcasts/podcasts-client.tsx @@ -0,0 +1,787 @@ +'use client'; + +import { useState, useEffect, useRef } from 'react'; +import { motion, AnimatePresence } from 'framer-motion'; +import { format } from 'date-fns'; +import { + Search, Calendar, Trash2, MoreHorizontal, Podcast, + Play, Pause, SkipForward, SkipBack, Volume2, VolumeX +} from 'lucide-react'; + +// UI Components +import { Input } from '@/components/ui/input'; +import { Button } from '@/components/ui/button'; +import { Card, CardContent, CardFooter, CardHeader, CardTitle } from '@/components/ui/card'; +import { Slider } from '@/components/ui/slider'; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuTrigger, + DropdownMenuSeparator +} from '@/components/ui/dropdown-menu'; +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog"; +import { + Select, + SelectContent, + SelectGroup, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import { toast } from "sonner"; + +interface Podcast { + id: number; + title: string; + created_at: string; + file_location: string; + podcast_transcript: any[]; + search_space_id: number; +} + +interface PodcastsPageClientProps { + searchSpaceId: string; +} + +const pageVariants = { + initial: { opacity: 0 }, + enter: { opacity: 1, transition: { duration: 0.3, ease: 'easeInOut' } }, + exit: { opacity: 0, transition: { duration: 0.3, ease: 'easeInOut' } } +}; + +const podcastCardVariants = { + initial: { y: 20, opacity: 0 }, + animate: { y: 0, opacity: 1 }, + exit: { y: -20, opacity: 0 } +}; + +const MotionCard = motion(Card); + +export default 
function PodcastsPageClient({ searchSpaceId }: PodcastsPageClientProps) { + const [podcasts, setPodcasts] = useState([]); + const [filteredPodcasts, setFilteredPodcasts] = useState([]); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + const [searchQuery, setSearchQuery] = useState(''); + const [sortOrder, setSortOrder] = useState('newest'); + const [deleteDialogOpen, setDeleteDialogOpen] = useState(false); + const [podcastToDelete, setPodcastToDelete] = useState<{ id: number, title: string } | null>(null); + const [isDeleting, setIsDeleting] = useState(false); + + // Audio player state + const [currentPodcast, setCurrentPodcast] = useState(null); + const [audioSrc, setAudioSrc] = useState(undefined); + const [isAudioLoading, setIsAudioLoading] = useState(false); + const [isPlaying, setIsPlaying] = useState(false); + const [currentTime, setCurrentTime] = useState(0); + const [duration, setDuration] = useState(0); + const [volume, setVolume] = useState(0.7); + const [isMuted, setIsMuted] = useState(false); + const audioRef = useRef(null); + const currentObjectUrlRef = useRef(null); + + // Add podcast image URL constant + const PODCAST_IMAGE_URL = "https://static.vecteezy.com/system/resources/thumbnails/002/157/611/small_2x/illustrations-concept-design-podcast-channel-free-vector.jpg"; + + // Fetch podcasts from API + useEffect(() => { + const fetchPodcasts = async () => { + try { + setIsLoading(true); + + // Get token from localStorage + const token = localStorage.getItem('surfsense_bearer_token'); + + if (!token) { + setError('Authentication token not found. 
Please log in again.'); + setIsLoading(false); + return; + } + + // Fetch all podcasts for this search space + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/`, + { + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json', + }, + cache: 'no-store', + } + ); + + if (!response.ok) { + const errorData = await response.json().catch(() => null); + throw new Error(`Failed to fetch podcasts: ${response.status} ${errorData?.detail || ''}`); + } + + const data: Podcast[] = await response.json(); + setPodcasts(data); + setFilteredPodcasts(data); + setError(null); + } catch (error) { + console.error('Error fetching podcasts:', error); + setError(error instanceof Error ? error.message : 'Unknown error occurred'); + setPodcasts([]); + setFilteredPodcasts([]); + } finally { + setIsLoading(false); + } + }; + + fetchPodcasts(); + }, [searchSpaceId]); + + // Filter and sort podcasts based on search query and sort order + useEffect(() => { + let result = [...podcasts]; + + // Filter by search term + if (searchQuery) { + const query = searchQuery.toLowerCase(); + result = result.filter(podcast => + podcast.title.toLowerCase().includes(query) + ); + } + + // Filter by search space + result = result.filter(podcast => + podcast.search_space_id === parseInt(searchSpaceId) + ); + + // Sort podcasts + result.sort((a, b) => { + const dateA = new Date(a.created_at).getTime(); + const dateB = new Date(b.created_at).getTime(); + + return sortOrder === 'newest' ? 
dateB - dateA : dateA - dateB; + }); + + setFilteredPodcasts(result); + }, [podcasts, searchQuery, sortOrder, searchSpaceId]); + + // Cleanup object URL on unmount or when currentPodcast changes + useEffect(() => { + return () => { + if (currentObjectUrlRef.current) { + URL.revokeObjectURL(currentObjectUrlRef.current); + currentObjectUrlRef.current = null; + } + }; + }, []); + + // Audio player time update handler + const handleTimeUpdate = () => { + if (audioRef.current) { + setCurrentTime(audioRef.current.currentTime); + } + }; + + // Audio player metadata loaded handler + const handleMetadataLoaded = () => { + if (audioRef.current) { + setDuration(audioRef.current.duration); + } + }; + + // Play/pause toggle + const togglePlayPause = () => { + if (audioRef.current) { + if (isPlaying) { + audioRef.current.pause(); + } else { + audioRef.current.play(); + } + setIsPlaying(!isPlaying); + } + }; + + // Seek to position + const handleSeek = (value: number[]) => { + if (audioRef.current) { + audioRef.current.currentTime = value[0]; + setCurrentTime(value[0]); + } + }; + + // Volume change + const handleVolumeChange = (value: number[]) => { + if (audioRef.current) { + const newVolume = value[0]; + audioRef.current.volume = newVolume; + setVolume(newVolume); + + if (newVolume === 0) { + setIsMuted(true); + } else if (isMuted) { + setIsMuted(false); + } + } + }; + + // Toggle mute + const toggleMute = () => { + if (audioRef.current) { + audioRef.current.muted = !isMuted; + setIsMuted(!isMuted); + } + }; + + // Skip forward 10 seconds + const skipForward = () => { + if (audioRef.current) { + audioRef.current.currentTime = Math.min(audioRef.current.duration, audioRef.current.currentTime + 10); + } + }; + + // Skip backward 10 seconds + const skipBackward = () => { + if (audioRef.current) { + audioRef.current.currentTime = Math.max(0, audioRef.current.currentTime - 10); + } + }; + + // Format time in MM:SS + const formatTime = (time: number) => { + const minutes = 
Math.floor(time / 60); + const seconds = Math.floor(time % 60); + return `${minutes}:${seconds < 10 ? '0' : ''}${seconds}`; + }; + + // Play podcast - Fetch blob and set object URL + const playPodcast = async (podcast: Podcast) => { + // If the same podcast is selected, just toggle play/pause + if (currentPodcast && currentPodcast.id === podcast.id) { + togglePlayPause(); + return; + } + + // Revoke previous object URL if exists + if (currentObjectUrlRef.current) { + URL.revokeObjectURL(currentObjectUrlRef.current); + currentObjectUrlRef.current = null; + } + + // Reset player state and show loading + setCurrentPodcast(podcast); + setAudioSrc(undefined); + setCurrentTime(0); + setDuration(0); + setIsPlaying(false); + setIsAudioLoading(true); + + try { + const token = localStorage.getItem('surfsense_bearer_token'); + if (!token) { + toast.error('Authentication token not found.'); + setIsAudioLoading(false); + return; + } + + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/${podcast.id}/stream`, + { + headers: { + 'Authorization': `Bearer ${token}`, + }, + } + ); + + if (!response.ok) { + throw new Error(`Failed to fetch audio stream: ${response.statusText}`); + } + + const blob = await response.blob(); + const objectUrl = URL.createObjectURL(blob); + currentObjectUrlRef.current = objectUrl; + setAudioSrc(objectUrl); + + // Let the audio element load the new src + setTimeout(() => { + if (audioRef.current) { + audioRef.current.load(); + audioRef.current.play() + .then(() => { + setIsPlaying(true); + }) + .catch(error => { + console.error('Error playing audio:', error); + toast.error('Failed to play audio.'); + setIsPlaying(false); + }); + } + }, 50); + + } catch (error) { + console.error('Error fetching or playing podcast:', error); + toast.error(error instanceof Error ? 
error.message : 'Failed to load podcast audio.'); + setCurrentPodcast(null); + } finally { + setIsAudioLoading(false); + } + }; + + // Function to handle podcast deletion + const handleDeletePodcast = async () => { + if (!podcastToDelete) return; + + setIsDeleting(true); + try { + const token = localStorage.getItem('surfsense_bearer_token'); + if (!token) { + setIsDeleting(false); + return; + } + + const response = await fetch(`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/podcasts/${podcastToDelete.id}`, { + method: 'DELETE', + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json', + } + }); + + if (!response.ok) { + throw new Error(`Failed to delete podcast: ${response.statusText}`); + } + + // Close dialog and refresh podcasts + setDeleteDialogOpen(false); + setPodcastToDelete(null); + + // Update local state by removing the deleted podcast + setPodcasts(prevPodcasts => prevPodcasts.filter(podcast => podcast.id !== podcastToDelete.id)); + + // If the current playing podcast is deleted, stop playback + if (currentPodcast && currentPodcast.id === podcastToDelete.id) { + if (audioRef.current) { + audioRef.current.pause(); + } + setCurrentPodcast(null); + setIsPlaying(false); + } + + toast.success('Podcast deleted successfully'); + } catch (error) { + console.error('Error deleting podcast:', error); + toast.error(error instanceof Error ? error.message : 'Failed to delete podcast'); + } finally { + setIsDeleting(false); + } + }; + + return ( + +
+
+

Podcasts

+

Listen to generated podcasts.

+
+ + {/* Filter and Search Bar */} +
+
+
+ + setSearchQuery(e.target.value)} + /> +
+
+ +
+ +
+
+ + {/* Status Messages */} + {isLoading && ( +
+
+
+

Loading podcasts...

+
+
+ )} + + {error && !isLoading && ( +
+

Error loading podcasts

+

{error}

+
+ )} + + {!isLoading && !error && filteredPodcasts.length === 0 && ( +
+ +

No podcasts found

+

+ {searchQuery + ? 'Try adjusting your search filters' + : 'Generate podcasts from your chats to get started'} +

+
+ )} + + {/* Podcast Grid */} + {!isLoading && !error && filteredPodcasts.length > 0 && ( + +
+ {filteredPodcasts.map((podcast, index) => ( + +
playPodcast(podcast)} + > + {/* Podcast image */} + Podcast illustration + + {/* Overlay for better contrast with controls */} +
+ + {/* Loading indicator */} + {currentPodcast?.id === podcast.id && isAudioLoading && ( +
+
+
+ )} + + {/* Play button */} + {!(currentPodcast?.id === podcast.id && (isPlaying || isAudioLoading)) && ( + + )} + + {/* Pause button */} + {currentPodcast?.id === podcast.id && isPlaying && !isAudioLoading && ( + + )} +
+ +
+

+ {podcast.title || 'Untitled Podcast'} +

+

+ + {format(new Date(podcast.created_at), 'MMM d, yyyy')} +

+
+ + {currentPodcast?.id === podcast.id && !isAudioLoading && ( +
+
{ + if (!audioRef.current || !duration) return; + const container = e.currentTarget; + const rect = container.getBoundingClientRect(); + const x = e.clientX - rect.left; + const percentage = Math.max(0, Math.min(1, x / rect.width)); + const newTime = percentage * duration; + handleSeek([newTime]); + }} + > +
+
+
+
+
+ {formatTime(currentTime)} + {formatTime(duration)} +
+
+ )} + + {currentPodcast?.id === podcast.id && !isAudioLoading && ( +
+ + + +
+ )} + +
+ + + + + + { + setPodcastToDelete({ id: podcast.id, title: podcast.title }); + setDeleteDialogOpen(true); + }} + > + + Delete Podcast + + + +
+ + + ))} +
+ + )} + + {/* Current Podcast Player (Fixed at bottom) */} + {currentPodcast && !isAudioLoading && audioSrc && ( + +
+
+
+
+ +
+
+ +
+

{currentPodcast.title}

+ +
+
+ +
+
+ {formatTime(currentTime)} / {formatTime(duration)} +
+
+
+ +
+ + + + + + +
+ + + +
+
+
+
+
+ )} +
+ + {/* Delete Confirmation Dialog */} + + + + + + Delete Podcast + + + Are you sure you want to delete {podcastToDelete?.title}? This action cannot be undone. + + + + + + + + + + {/* Hidden audio element for playback */} +