(null);
-
- // Function to validate a URL
- const isValidUrl = (url: string): boolean => {
- return urlRegex.test(url);
- };
-
- // Function to handle URL submission
- const handleSubmit = async () => {
- // Validate that we have at least one URL
- if (urlTags.length === 0) {
- setError(t("error_no_url"));
- return;
- }
-
- // Validate all URLs
- const invalidUrls = urlTags.filter((tag) => !isValidUrl(tag.text));
- if (invalidUrls.length > 0) {
- setError(t("error_invalid_urls", { urls: invalidUrls.map((tag) => tag.text).join(", ") }));
- return;
- }
-
- setError(null);
- setIsSubmitting(true);
-
- try {
- toast(t("crawling_toast"), {
- description: t("crawling_toast_desc"),
- });
-
- // Extract URLs from tags
- const urls = urlTags.map((tag) => tag.text);
-
- // Make API call to backend
- const response = await fetch(
- `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents`,
- {
- method: "POST",
- headers: {
- "Content-Type": "application/json",
- Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
- },
- body: JSON.stringify({
- document_type: "CRAWLED_URL",
- content: urls,
- search_space_id: parseInt(search_space_id),
- }),
- }
- );
-
- if (!response.ok) {
- throw new Error("Failed to crawl URLs");
- }
-
- await response.json();
-
- toast(t("success_toast"), {
- description: t("success_toast_desc"),
- });
-
- // Redirect to documents page
- router.push(`/dashboard/${search_space_id}/documents`);
- } catch (error: any) {
- setError(error.message || t("error_generic"));
- toast(t("error_toast"), {
- description: `${t("error_toast_desc")}: ${error.message}`,
- });
- } finally {
- setIsSubmitting(false);
- }
- };
-
- // Function to add a new URL tag
- const handleAddTag = (text: string) => {
- // Basic URL validation
- if (!isValidUrl(text)) {
- toast(t("invalid_url_toast"), {
- description: t("invalid_url_toast_desc"),
- });
- return;
- }
-
- // Check for duplicates
- if (urlTags.some((tag) => tag.text === text)) {
- toast(t("duplicate_url_toast"), {
- description: t("duplicate_url_toast_desc"),
- });
- return;
- }
-
- // Add the new tag
- const newTag: Tag = {
- id: Date.now().toString(),
- text: text,
- };
-
- setUrlTags([...urlTags, newTag]);
- };
-
- return (
-
-
-
-
-
- {t("title")}
-
- {t("subtitle")}
-
-
-
-
-
-
-
{t("hint")}
-
-
- {error &&
{error}
}
-
-
-
{t("tips_title")}
-
- - {t("tip_1")}
- - {t("tip_2")}
- - {t("tip_3")}
- - {t("tip_4")}
-
-
-
-
-
-
-
-
-
-
- );
-}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/sources/add/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/sources/add/page.tsx
index 5172a0fa8..335d6d235 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/sources/add/page.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/sources/add/page.tsx
@@ -1,9 +1,9 @@
"use client";
import { IconBrandYoutube } from "@tabler/icons-react";
-import { Cable, Database, Upload } from "lucide-react";
+import { Cable, Database, Globe, Upload } from "lucide-react";
import { motion } from "motion/react";
-import { useParams, useSearchParams } from "next/navigation";
+import { useParams, useRouter, useSearchParams } from "next/navigation";
import { useEffect, useState } from "react";
import { ConnectorsTab } from "@/components/sources/ConnectorsTab";
import { DocumentUploadTab } from "@/components/sources/DocumentUploadTab";
@@ -12,6 +12,7 @@ import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
export default function AddSourcesPage() {
const params = useParams();
+ const router = useRouter();
const searchParams = useSearchParams();
const search_space_id = params.search_space_id as string;
const [activeTab, setActiveTab] = useState("documents");
@@ -24,6 +25,14 @@ export default function AddSourcesPage() {
}
}, [searchParams]);
+ const handleTabChange = (value: string) => {
+ if (value === "webpages") {
+ router.push(`/dashboard/${search_space_id}/connectors/add/webcrawler-connector`);
+ } else {
+ setActiveTab(value);
+ }
+ };
+
return (
{/* Tabs */}
-
-
+
+
- Documents
+ Documents
+ Docs
YouTube
+
+
+ Web Pages
+ Web
+
- Connectors
+ Connectors
+ More
diff --git a/surfsense_web/components/dashboard-breadcrumb.tsx b/surfsense_web/components/dashboard-breadcrumb.tsx
index 0324ee1b6..3e2e5199e 100644
--- a/surfsense_web/components/dashboard-breadcrumb.tsx
+++ b/surfsense_web/components/dashboard-breadcrumb.tsx
@@ -138,6 +138,7 @@ export function DashboardBreadcrumb() {
"linkup-api": "LinkUp API",
"luma-connector": "Luma",
"elasticsearch-connector": "Elasticsearch",
+ "webcrawler-connector": "Web Pages",
};
const connectorLabel = connectorLabels[connectorType] || connectorType;
diff --git a/surfsense_web/components/editConnector/types.ts b/surfsense_web/components/editConnector/types.ts
index 8a9ef29d2..490474222 100644
--- a/surfsense_web/components/editConnector/types.ts
+++ b/surfsense_web/components/editConnector/types.ts
@@ -52,5 +52,7 @@ export const editConnectorSchema = z.object({
GOOGLE_CALENDAR_CALENDAR_IDS: z.string().optional(),
LUMA_API_KEY: z.string().optional(),
ELASTICSEARCH_API_KEY: z.string().optional(),
+ FIRECRAWL_API_KEY: z.string().optional(),
+ INITIAL_URLS: z.string().optional(),
});
export type EditConnectorFormValues = z.infer;
diff --git a/surfsense_web/components/homepage/integrations.tsx b/surfsense_web/components/homepage/integrations.tsx
index 6d8433ac0..0a5d93a98 100644
--- a/surfsense_web/components/homepage/integrations.tsx
+++ b/surfsense_web/components/homepage/integrations.tsx
@@ -29,6 +29,7 @@ const INTEGRATIONS: Integration[] = [
// Documentation & Knowledge
{ name: "Confluence", icon: "https://cdn.simpleicons.org/confluence/172B4D" },
{ name: "Notion", icon: "https://cdn.simpleicons.org/notion/000000/ffffff" },
+ { name: "Web Pages", icon: "https://cdn.jsdelivr.net/npm/lucide-static@0.294.0/icons/globe.svg" },
// Cloud Storage
{ name: "Google Drive", icon: "https://cdn.simpleicons.org/googledrive/4285F4" },
diff --git a/surfsense_web/components/sources/ConnectorsTab.tsx b/surfsense_web/components/sources/ConnectorsTab.tsx
index b8d3486f6..c9640d09b 100644
--- a/surfsense_web/components/sources/ConnectorsTab.tsx
+++ b/surfsense_web/components/sources/ConnectorsTab.tsx
@@ -19,11 +19,14 @@ interface ConnectorsTabProps {
export function ConnectorsTab({ searchSpaceId }: ConnectorsTabProps) {
const t = useTranslations("add_connector");
const [expandedCategories, setExpandedCategories] = useState([
- "search-engines",
- "knowledge-bases",
+ "web-search",
+ "messaging",
"project-management",
- "team-chats",
- "communication",
+ "documentation",
+ "development",
+ "databases",
+ "productivity",
+ "web-crawling",
]);
const toggleCategory = (categoryId: string) => {
diff --git a/surfsense_web/components/sources/connector-data.tsx b/surfsense_web/components/sources/connector-data.tsx
index 7f8f6bdf3..631bb8606 100644
--- a/surfsense_web/components/sources/connector-data.tsx
+++ b/surfsense_web/components/sources/connector-data.tsx
@@ -5,8 +5,21 @@ import type { ConnectorCategory } from "./types";
export const connectorCategories: ConnectorCategory[] = [
{
- id: "search-engines",
- title: "search_engines",
+ id: "web-crawling",
+ title: "web_crawling",
+ connectors: [
+ {
+ id: "webcrawler-connector",
+ title: "Web Pages",
+ description: "webcrawler_desc",
+ icon: getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6"),
+ status: "available",
+ },
+ ],
+ },
+ {
+ id: "web-search",
+ title: "web_search",
connectors: [
{
id: "tavily-api",
@@ -29,13 +42,6 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.LINKUP_API, "h-6 w-6"),
status: "available",
},
- {
- id: "elasticsearch-connector",
- title: "Elasticsearch",
- description: "elasticsearch_desc",
- icon: getConnectorIcon(EnumConnectorName.ELASTICSEARCH_CONNECTOR, "h-6 w-6"),
- status: "available",
- },
{
id: "baidu-search-api",
title: "Baidu Search",
@@ -46,8 +52,8 @@ export const connectorCategories: ConnectorCategory[] = [
],
},
{
- id: "team-chats",
- title: "team_chats",
+ id: "messaging",
+ title: "messaging",
connectors: [
{
id: "slack-connector",
@@ -56,13 +62,6 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.SLACK_CONNECTOR, "h-6 w-6"),
status: "available",
},
- {
- id: "ms-teams",
- title: "Microsoft Teams",
- description: "teams_desc",
- icon: ,
- status: "coming-soon",
- },
{
id: "discord-connector",
title: "Discord",
@@ -70,6 +69,13 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.DISCORD_CONNECTOR, "h-6 w-6"),
status: "available",
},
+ {
+ id: "ms-teams",
+ title: "Microsoft Teams",
+ description: "teams_desc",
+ icon: ,
+ status: "coming-soon",
+ },
],
},
{
@@ -100,8 +106,8 @@ export const connectorCategories: ConnectorCategory[] = [
],
},
{
- id: "knowledge-bases",
- title: "knowledge_bases",
+ id: "documentation",
+ title: "documentation",
connectors: [
{
id: "notion-connector",
@@ -110,6 +116,19 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.NOTION_CONNECTOR, "h-6 w-6"),
status: "available",
},
+ {
+ id: "confluence-connector",
+ title: "Confluence",
+ description: "confluence_desc",
+ icon: getConnectorIcon(EnumConnectorName.CONFLUENCE_CONNECTOR, "h-6 w-6"),
+ status: "available",
+ },
+ ],
+ },
+ {
+ id: "development",
+ title: "development",
+ connectors: [
{
id: "github-connector",
title: "GitHub",
@@ -117,11 +136,17 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.GITHUB_CONNECTOR, "h-6 w-6"),
status: "available",
},
+ ],
+ },
+ {
+ id: "databases",
+ title: "databases",
+ connectors: [
{
- id: "confluence-connector",
- title: "Confluence",
- description: "confluence_desc",
- icon: getConnectorIcon(EnumConnectorName.CONFLUENCE_CONNECTOR, "h-6 w-6"),
+ id: "elasticsearch-connector",
+ title: "Elasticsearch",
+ description: "elasticsearch_desc",
+ icon: getConnectorIcon(EnumConnectorName.ELASTICSEARCH_CONNECTOR, "h-6 w-6"),
status: "available",
},
{
@@ -131,18 +156,11 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.AIRTABLE_CONNECTOR, "h-6 w-6"),
status: "available",
},
- {
- id: "luma-connector",
- title: "Luma",
- description: "luma_desc",
- icon: getConnectorIcon(EnumConnectorName.LUMA_CONNECTOR, "h-6 w-6"),
- status: "available",
- },
],
},
{
- id: "communication",
- title: "communication",
+ id: "productivity",
+ title: "productivity",
connectors: [
{
id: "google-calendar-connector",
@@ -158,6 +176,13 @@ export const connectorCategories: ConnectorCategory[] = [
icon: getConnectorIcon(EnumConnectorName.GOOGLE_GMAIL_CONNECTOR, "h-6 w-6"),
status: "available",
},
+ {
+ id: "luma-connector",
+ title: "Luma",
+ description: "luma_desc",
+ icon: getConnectorIcon(EnumConnectorName.LUMA_CONNECTOR, "h-6 w-6"),
+ status: "available",
+ },
{
id: "zoom",
title: "Zoom",
diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx
index 507003a5f..46ef4128b 100644
--- a/surfsense_web/content/docs/docker-installation.mdx
+++ b/surfsense_web/content/docs/docker-installation.mdx
@@ -97,7 +97,7 @@ Before you begin, ensure you have:
| STT_SERVICE | Speech-to-Text API provider for Audio Files (e.g., `local/base`, `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) |
| STT_SERVICE_API_KEY | (Optional if local) API key for the Speech-to-Text service |
| STT_SERVICE_API_BASE | (Optional) Custom API base URL for the Speech-to-Text service |
-| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling |
+| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling |
| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats), `LLAMACLOUD` (supports 50+ formats including legacy document types), or `DOCLING` (local processing, supports PDF, Office docs, images, HTML, CSV) |
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) |
| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) |
diff --git a/surfsense_web/contracts/enums/connector.ts b/surfsense_web/contracts/enums/connector.ts
index 50486c92e..5fd6fb723 100644
--- a/surfsense_web/contracts/enums/connector.ts
+++ b/surfsense_web/contracts/enums/connector.ts
@@ -17,4 +17,5 @@ export enum EnumConnectorName {
AIRTABLE_CONNECTOR = "AIRTABLE_CONNECTOR",
LUMA_CONNECTOR = "LUMA_CONNECTOR",
ELASTICSEARCH_CONNECTOR = "ELASTICSEARCH_CONNECTOR",
+ WEBCRAWLER_CONNECTOR = "WEBCRAWLER_CONNECTOR",
}
diff --git a/surfsense_web/contracts/enums/connectorIcons.tsx b/surfsense_web/contracts/enums/connectorIcons.tsx
index 66e9edfcc..a2b7e0b12 100644
--- a/surfsense_web/contracts/enums/connectorIcons.tsx
+++ b/surfsense_web/contracts/enums/connectorIcons.tsx
@@ -59,11 +59,13 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
return ;
case EnumConnectorName.ELASTICSEARCH_CONNECTOR:
return ;
+ case EnumConnectorName.WEBCRAWLER_CONNECTOR:
+ return ;
// Additional cases for non-enum connector types
- case "YOUTUBE_VIDEO":
- return ;
case "CRAWLED_URL":
return ;
+ case "YOUTUBE_VIDEO":
+ return ;
case "FILE":
return ;
case "EXTENSION":
diff --git a/surfsense_web/hooks/use-connector-edit-page.ts b/surfsense_web/hooks/use-connector-edit-page.ts
index 870a87dcb..00be9ef17 100644
--- a/surfsense_web/hooks/use-connector-edit-page.ts
+++ b/surfsense_web/hooks/use-connector-edit-page.ts
@@ -97,6 +97,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
JIRA_API_TOKEN: "",
LUMA_API_KEY: "",
ELASTICSEARCH_API_KEY: "",
+ FIRECRAWL_API_KEY: "",
+ INITIAL_URLS: "",
},
});
@@ -142,6 +144,8 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
JIRA_API_TOKEN: config.JIRA_API_TOKEN || "",
LUMA_API_KEY: config.LUMA_API_KEY || "",
ELASTICSEARCH_API_KEY: config.ELASTICSEARCH_API_KEY || "",
+ FIRECRAWL_API_KEY: config.FIRECRAWL_API_KEY || "",
+ INITIAL_URLS: config.INITIAL_URLS || "",
});
if (currentConnector.connector_type === "GITHUB_CONNECTOR") {
const savedRepos = config.repo_full_names || [];
@@ -469,6 +473,35 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
newConfig = { ELASTICSEARCH_API_KEY: formData.ELASTICSEARCH_API_KEY };
}
break;
+ case "WEBCRAWLER_CONNECTOR":
+ if (
+ formData.FIRECRAWL_API_KEY !== originalConfig.FIRECRAWL_API_KEY ||
+ formData.INITIAL_URLS !== originalConfig.INITIAL_URLS
+ ) {
+ newConfig = {};
+
+ if (formData.FIRECRAWL_API_KEY && formData.FIRECRAWL_API_KEY.trim()) {
+ if (!formData.FIRECRAWL_API_KEY.startsWith("fc-")) {
+ toast.warning(
+ "Firecrawl API keys typically start with 'fc-'. Please verify your key."
+ );
+ }
+ newConfig.FIRECRAWL_API_KEY = formData.FIRECRAWL_API_KEY.trim();
+ } else if (originalConfig.FIRECRAWL_API_KEY) {
+ toast.info(
+ "Firecrawl API key removed. Web crawler will use AsyncChromiumLoader as fallback."
+ );
+ }
+
+ if (formData.INITIAL_URLS !== undefined) {
+ if (formData.INITIAL_URLS && formData.INITIAL_URLS.trim()) {
+ newConfig.INITIAL_URLS = formData.INITIAL_URLS.trim();
+ } else if (originalConfig.INITIAL_URLS) {
+ toast.info("URLs removed from crawler configuration.");
+ }
+ }
+ }
+ break;
}
if (newConfig !== null) {
@@ -562,6 +595,9 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
"ELASTICSEARCH_API_KEY",
newlySavedConfig.ELASTICSEARCH_API_KEY || ""
);
+ } else if (connector.connector_type === "WEBCRAWLER_CONNECTOR") {
+ editForm.setValue("FIRECRAWL_API_KEY", newlySavedConfig.FIRECRAWL_API_KEY || "");
+ editForm.setValue("INITIAL_URLS", newlySavedConfig.INITIAL_URLS || "");
}
}
if (connector.connector_type === "GITHUB_CONNECTOR") {
diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts
index f2052900c..c921bd1a8 100644
--- a/surfsense_web/lib/connectors/utils.ts
+++ b/surfsense_web/lib/connectors/utils.ts
@@ -18,6 +18,7 @@ export const getConnectorTypeDisplay = (type: string): string => {
AIRTABLE_CONNECTOR: "Airtable",
LUMA_CONNECTOR: "Luma",
ELASTICSEARCH_CONNECTOR: "Elasticsearch",
+ WEBCRAWLER_CONNECTOR: "Web Pages",
};
return typeMap[type] || type;
};
diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json
index ee1bea40b..758e558b3 100644
--- a/surfsense_web/messages/en.json
+++ b/surfsense_web/messages/en.json
@@ -304,11 +304,14 @@
"add_connector": {
"title": "Connect Your Tools",
"subtitle": "Integrate with your favorite services to enhance your research capabilities.",
- "search_engines": "Search Engines",
- "team_chats": "Team Chats",
+ "web_search": "Web Search",
+ "messaging": "Messaging",
"project_management": "Project Management",
- "knowledge_bases": "Knowledge Bases",
- "communication": "Communication",
+ "documentation": "Documentation",
+ "development": "Development",
+ "databases": "Databases",
+ "productivity": "Productivity",
+ "web_crawling": "Web Crawling",
"connect": "Connect",
"coming_soon": "Coming Soon",
"connected": "Connected",
@@ -328,10 +331,11 @@
"github_desc": "Connect a GitHub PAT to index code and docs from accessible repositories.",
"confluence_desc": "Connect to Confluence to search pages, comments and documentation.",
"airtable_desc": "Connect to Airtable to search records, tables and database content.",
- "luma_desc": "Connect to Luma to search events",
+ "luma_desc": "Connect to Luma to search events, meetups and gatherings.",
"calendar_desc": "Connect to Google Calendar to search events, meetings and schedules.",
"gmail_desc": "Connect to your Gmail account to search through your emails.",
- "zoom_desc": "Connect to Zoom to access meeting recordings and transcripts."
+ "zoom_desc": "Connect to Zoom to access meeting recordings and transcripts.",
+ "webcrawler_desc": "Crawl and index content from any public web page."
},
"upload_documents": {
"title": "Upload Documents",
diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json
index 11880a2e8..857649b80 100644
--- a/surfsense_web/messages/zh.json
+++ b/surfsense_web/messages/zh.json
@@ -304,11 +304,14 @@
"add_connector": {
"title": "连接您的工具",
"subtitle": "集成您喜欢的服务以增强研究能力。",
- "search_engines": "搜索引擎",
- "team_chats": "团队聊天",
+ "web_search": "网络搜索",
+ "messaging": "即时通讯",
"project_management": "项目管理",
- "knowledge_bases": "知识库",
- "communication": "通讯",
+ "documentation": "文档协作",
+ "development": "开发工具",
+ "databases": "数据库",
+ "productivity": "效率工具",
+ "web_crawling": "网页爬取",
"connect": "连接",
"coming_soon": "即将推出",
"connected": "已连接",
@@ -328,10 +331,11 @@
"github_desc": "连接 GitHub PAT 以索引可访问存储库的代码和文档。",
"confluence_desc": "连接到 Confluence 以搜索页面、评论和文档。",
"airtable_desc": "连接到 Airtable 以搜索记录、表格和数据库内容。",
- "luma_desc": "连接到 Luma 以搜索活动",
+ "luma_desc": "连接到 Luma 以搜索活动、聚会和集会。",
"calendar_desc": "连接到 Google 日历以搜索活动、会议和日程。",
"gmail_desc": "连接到您的 Gmail 账户以搜索您的电子邮件。",
- "zoom_desc": "连接到 Zoom 以访问会议录制和转录。"
+ "zoom_desc": "连接到 Zoom 以访问会议录制和转录。",
+ "webcrawler_desc": "爬取和索引任何公开网页的内容。"
},
"upload_documents": {
"title": "上传文档",