Renaming resources

This commit is contained in:
samkul-swe 2025-11-22 19:19:00 -08:00
parent 6d19e0fad8
commit 121e2f0c0e
24 changed files with 117 additions and 273 deletions

View file

@ -44,9 +44,9 @@ def upgrade() -> None:
IF NOT EXISTS (
SELECT 1 FROM pg_type t
JOIN pg_enum e ON t.oid = e.enumtypid
WHERE t.typname = 'documenttype' AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
WHERE t.typname = 'documenttype' AND e.enumlabel = 'CRAWLED_URL'
) THEN
ALTER TYPE documenttype ADD VALUE 'WEBCRAWLER_CONNECTOR';
ALTER TYPE documenttype ADD VALUE 'CRAWLED_URL';
END IF;
END
$$;

View file

@ -671,7 +671,7 @@ async def fetch_relevant_documents(
(
source_object,
crawled_urls_chunks,
) = await connector_service.search_crawled_urls(
) = await connector_service.search_webcrawler(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,

View file

@ -34,7 +34,7 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
- TAVILY_API: "Tavily search API results" (personalized search results)
- LINKUP_API: "Linkup search API results" (personalized search results)
- LUMA_CONNECTOR: "Luma events"
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense web crawler" (personally selected websites)
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
</knowledge_sources>
<instructions>

View file

@ -208,9 +208,6 @@ class Config:
# LlamaCloud API Key
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
# Firecrawl API Key
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
# Litellm TTS Configuration
TTS_SERVICE = os.getenv("TTS_SERVICE")
TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")

View file

@ -49,7 +49,7 @@ from app.tasks.connector_indexers import (
index_luma_events,
index_notion_pages,
index_slack_messages,
index_webcrawler_urls,
index_crawled_urls,
)
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
@ -691,12 +691,12 @@ async def index_connector_content(
response_message = "Elasticsearch indexing started in the background."
elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
from app.tasks.celery_tasks.connector_tasks import index_webcrawler_urls_task
from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
logger.info(
f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
)
index_webcrawler_urls_task.delay(
index_crawled_urls_task.delay(
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
)
response_message = "Web page indexing started in the background."
@ -1537,8 +1537,8 @@ async def run_elasticsearch_indexing(
exc_info=True,
)
# Add new helper functions for webcrawler indexing
async def run_webcrawler_indexing_with_new_session(
# Add new helper functions for crawled web page indexing
async def run_web_page_indexing_with_new_session(
connector_id: int,
search_space_id: int,
user_id: str,
@ -1546,16 +1546,16 @@ async def run_webcrawler_indexing_with_new_session(
end_date: str,
):
"""
Create a new session and run the Webcrawler indexing task.
Create a new session and run the Web page indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
async with async_session_maker() as session:
await run_webcrawler_indexing(
await run_web_page_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)
async def run_webcrawler_indexing(
async def run_web_page_indexing(
session: AsyncSession,
connector_id: int,
search_space_id: int,
@ -1564,7 +1564,7 @@ async def run_webcrawler_indexing(
end_date: str,
):
"""
Background task to run Webcrawler indexing.
Background task to run Web page indexing.
Args:
session: Database session
connector_id: ID of the webcrawler connector
@ -1574,7 +1574,7 @@ async def run_webcrawler_indexing(
end_date: End date for indexing
"""
try:
documents_processed, error_or_warning = await index_webcrawler_urls(
documents_processed, error_or_warning = await index_crawled_urls(
session=session,
connector_id=connector_id,
search_space_id=search_space_id,
@ -1588,11 +1588,11 @@ async def run_webcrawler_indexing(
if documents_processed > 0:
await update_connector_last_indexed(session, connector_id)
logger.info(
f"Webcrawler indexing completed successfully: {documents_processed} documents processed"
f"Web page indexing completed successfully: {documents_processed} documents processed"
)
else:
logger.error(
f"Webcrawler indexing failed or no documents processed: {error_or_warning}"
f"Web page indexing failed or no documents processed: {error_or_warning}"
)
except Exception as e:
logger.error(f"Error in background Webcrawler indexing task: {e!s}")
logger.error(f"Error in background Web page indexing task: {e!s}")

View file

@ -2573,4 +2573,4 @@ class ConnectorService:
"sources": sources_list,
}
return result_object, elasticsearch_chunks
return result_object, elasticsearch_chunks

View file

@ -602,8 +602,8 @@ async def _index_elasticsearch_documents(
)
@celery_app.task(name="index_webcrawler_urls", bind=True)
def index_webcrawler_urls_task(
@celery_app.task(name="index_crawled_urls", bind=True)
def index_crawled_urls_task(
self,
connector_id: int,
search_space_id: int,
@ -611,7 +611,7 @@ def index_webcrawler_urls_task(
start_date: str,
end_date: str,
):
"""Celery task to index Webcrawler Urls."""
"""Celery task to index Web page Urls."""
import asyncio
loop = asyncio.new_event_loop()
@ -619,7 +619,7 @@ def index_webcrawler_urls_task(
try:
loop.run_until_complete(
_index_webcrawler_urls(
_index_crawled_urls(
connector_id, search_space_id, user_id, start_date, end_date
)
)
@ -627,19 +627,19 @@ def index_webcrawler_urls_task(
loop.close()
async def _index_webcrawler_urls(
async def _index_crawled_urls(
connector_id: int,
search_space_id: int,
user_id: str,
start_date: str,
end_date: str,
):
"""Index Webcrawler Urls with new session."""
"""Index Web page Urls with new session."""
from app.routes.search_source_connectors_routes import (
run_webcrawler_indexing,
run_web_page_indexing,
)
async with get_celery_session_maker()() as session:
await run_webcrawler_indexing(
await run_web_page_indexing(
session, connector_id, search_space_id, user_id, start_date, end_date
)

View file

@ -77,7 +77,7 @@ async def _check_and_trigger_schedules():
index_luma_events_task,
index_notion_pages_task,
index_slack_messages_task,
index_webcrawler_urls_task
index_crawled_urls_task
)
# Map connector types to their tasks
@ -95,7 +95,7 @@ async def _check_and_trigger_schedules():
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
}
# Trigger indexing for each due connector

View file

@ -42,7 +42,7 @@ from .luma_indexer import index_luma_events
# Documentation and knowledge management
from .notion_indexer import index_notion_pages
from .slack_indexer import index_slack_messages
from .webcrawler_indexer import index_webcrawler_urls
from .webcrawler_indexer import index_crawled_urls
__all__ = [ # noqa: RUF022
"index_airtable_records",
@ -60,7 +60,7 @@ __all__ = [ # noqa: RUF022
"index_linear_issues",
# Documentation and knowledge management
"index_notion_pages",
"index_webcrawler_urls",
"index_crawled_urls",
# Communication platforms
"index_slack_messages",
"index_google_gmail_messages",

View file

@ -27,7 +27,7 @@ from .base import (
)
async def index_webcrawler_urls(
async def index_crawled_urls(
session: AsyncSession,
connector_id: int,
search_space_id: int,
@ -37,7 +37,7 @@ async def index_webcrawler_urls(
update_last_indexed: bool = True,
) -> tuple[int, str | None]:
"""
Index webcrawler URLs.
Index web page URLs.
Args:
session: Database session
@ -55,9 +55,9 @@ async def index_webcrawler_urls(
# Log task start
log_entry = await task_logger.log_task_start(
task_name="webcrawler_url_indexing",
task_name="crawled_url_indexing",
source="connector_indexing_task",
message=f"Starting webcrawler URL indexing for connector {connector_id}",
message=f"Starting web page URL indexing for connector {connector_id}",
metadata={
"connector_id": connector_id,
"user_id": str(user_id),
@ -104,7 +104,7 @@ async def index_webcrawler_urls(
urls = []
logger.info(
f"Starting webcrawler indexing for connector {connector_id} with {len(urls)} URLs"
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
)
# Initialize webcrawler client
@ -367,7 +367,7 @@ async def index_webcrawler_urls(
await task_logger.log_task_success(
log_entry,
f"Successfully completed webcrawler indexing for connector {connector_id}",
f"Successfully completed crawled web page indexing for connector {connector_id}",
{
"urls_processed": total_processed,
"documents_indexed": documents_indexed,
@ -378,7 +378,7 @@ async def index_webcrawler_urls(
)
logger.info(
f"Webcrawler indexing completed: {documents_indexed} new, "
f"Web page indexing completed: {documents_indexed} new, "
f"{documents_updated} updated, {documents_skipped} skipped, "
f"{len(failed_urls)} failed"
)
@ -388,7 +388,7 @@ async def index_webcrawler_urls(
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Database error during webcrawler indexing for connector {connector_id}",
f"Database error during web page indexing for connector {connector_id}",
str(db_error),
{"error_type": "SQLAlchemyError"},
)
@ -398,12 +398,12 @@ async def index_webcrawler_urls(
await session.rollback()
await task_logger.log_task_failure(
log_entry,
f"Failed to index webcrawler URLs for connector {connector_id}",
f"Failed to index web page URLs for connector {connector_id}",
str(e),
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index webcrawler URLs: {e!s}", exc_info=True)
return 0, f"Failed to index webcrawler URLs: {e!s}"
logger.error(f"Failed to index web page URLs: {e!s}", exc_info=True)
return 0, f"Failed to index web page URLs: {e!s}"
async def get_crawled_url_documents(

View file

@ -31,7 +31,7 @@ CONNECTOR_TASK_MAP = {
SearchSourceConnectorType.DISCORD_CONNECTOR: "index_discord_messages",
SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_webcrawler_urls",
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls",
}
@ -80,7 +80,7 @@ def create_periodic_schedule(
index_luma_events_task,
index_notion_pages_task,
index_slack_messages_task,
index_webcrawler_urls_task,
index_crawled_urls_task,
)
# Map connector type to task
@ -98,7 +98,7 @@ def create_periodic_schedule(
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
}
# Trigger the first run immediately

View file

@ -18,7 +18,16 @@ import {
CardHeader,
CardTitle,
} from "@/components/ui/card";
import { Form } from "@/components/ui/form";
import {
Form,
FormControl,
FormDescription,
FormField,
FormItem,
FormLabel,
FormMessage,
} from "@/components/ui/form";
import { Textarea } from "@/components/ui/textarea";
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
import { useConnectorEditPage } from "@/hooks/use-connector-edit-page";
// Import Utils, Types, Hook, and Components
@ -285,13 +294,35 @@ export default function EditConnectorPage() {
{/* == Webcrawler == */}
{connector.connector_type === "WEBCRAWLER_CONNECTOR" && (
<EditSimpleTokenForm
control={editForm.control}
fieldName="FIRECRAWL_API_KEY"
fieldLabel="Firecrawl API Key (Optional)"
fieldDescription="Add a Firecrawl API key for enhanced crawling capabilities. If not provided, will use AsyncChromiumLoader as fallback."
placeholder="fc-xxxxxxxxxxxxx"
/>
<div className="space-y-4">
<EditSimpleTokenForm
control={editForm.control}
fieldName="FIRECRAWL_API_KEY"
fieldLabel="Firecrawl API Key (Optional)"
fieldDescription="Add a Firecrawl API key for enhanced crawling capabilities. If not provided, will use AsyncChromiumLoader as fallback."
placeholder="fc-xxxxxxxxxxxxx"
/>
<FormField
control={editForm.control}
name="INITIAL_URLS"
render={({ field }) => (
<FormItem>
<FormLabel>URLs to Crawl</FormLabel>
<FormControl>
<Textarea
placeholder="https://example.com&#10;https://docs.example.com&#10;https://blog.example.com"
className="min-h-[150px] font-mono text-sm"
{...field}
/>
</FormControl>
<FormDescription>
Enter URLs to crawl (one per line). These URLs will be indexed when you trigger indexing.
</FormDescription>
<FormMessage />
</FormItem>
)}
/>
</div>
)}
</CardContent>

View file

@ -55,7 +55,7 @@ const getConnectorTypeDisplay = (type: string): string => {
AIRTABLE_CONNECTOR: "Airtable Connector",
LUMA_CONNECTOR: "Luma Connector",
ELASTICSEARCH_CONNECTOR: "Elasticsearch Connector",
WEBCRAWLER_CONNECTOR: "Web Crawler Connector",
WEBCRAWLER_CONNECTOR: "Web Page Connector",
// Add other connector types here as needed
};
return typeMap[type] || type;

View file

@ -64,7 +64,7 @@ export default function WebcrawlerConnectorPage() {
const form = useForm<WebcrawlerConnectorFormValues>({
resolver: zodResolver(webcrawlerConnectorFormSchema),
defaultValues: {
name: "Web Crawler",
name: "Web Pages",
api_key: "",
initial_urls: "",
},
@ -150,7 +150,7 @@ export default function WebcrawlerConnectorPage() {
{getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6")}
</div>
<div>
<h1 className="text-3xl font-bold tracking-tight">Connect Web Crawler</h1>
<h1 className="text-3xl font-bold tracking-tight">Connect Web Pages</h1>
<p className="text-muted-foreground">Crawl and index web pages for search.</p>
</div>
</div>
@ -160,9 +160,9 @@ export default function WebcrawlerConnectorPage() {
{!doesConnectorExist ? (
<Card>
<CardHeader>
<CardTitle>Set Up Web Crawler</CardTitle>
<CardTitle>Set Up Web Page crawler</CardTitle>
<CardDescription>
Configure your web crawler to index web pages. Optionally add a Firecrawl API key
Configure your web page crawler to index web pages. Optionally add a Firecrawl API key
for enhanced crawling capabilities.
</CardDescription>
</CardHeader>
@ -277,7 +277,7 @@ export default function WebcrawlerConnectorPage() {
/* Success Card */
<Card>
<CardHeader>
<CardTitle> Your web crawler is successfully set up!</CardTitle>
<CardTitle> Your web page crawler is successfully set up!</CardTitle>
<CardDescription>
You can now add URLs to crawl from the connector management page.
</CardDescription>

View file

@ -1,201 +0,0 @@
"use client";
import { type Tag, TagInput } from "emblor";
import { Globe, Loader2 } from "lucide-react";
import { useParams, useRouter } from "next/navigation";
import { useTranslations } from "next-intl";
import { useState } from "react";
import { toast } from "sonner";
import { Button } from "@/components/ui/button";
import {
Card,
CardContent,
CardDescription,
CardFooter,
CardHeader,
CardTitle,
} from "@/components/ui/card";
import { Label } from "@/components/ui/label";
// URL validation regex.
// The path part is one character class with a single quantifier. Wrapping it
// in a second quantifier (`(...*)*`) accepts exactly the same strings but can
// backtrack exponentially on long inputs that end in a disallowed character.
const urlRegex = /^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})[/\w .-]*\/?$/;

/**
 * Page that collects URLs as tags and submits them to the backend documents
 * endpoint as `CRAWLED_URL` documents for the search space taken from the
 * route params. On success the user is redirected to the documents page.
 */
export default function WebpageCrawler() {
	const t = useTranslations("add_webpage");
	const params = useParams();
	const router = useRouter();
	const search_space_id = params.search_space_id as string;

	const [urlTags, setUrlTags] = useState<Tag[]>([]);
	const [activeTagIndex, setActiveTagIndex] = useState<number | null>(null);
	const [isSubmitting, setIsSubmitting] = useState(false);
	const [error, setError] = useState<string | null>(null);

	// Function to validate a URL
	const isValidUrl = (url: string): boolean => {
		return urlRegex.test(url);
	};

	// Function to handle URL submission
	const handleSubmit = async () => {
		// Validate that we have at least one URL
		if (urlTags.length === 0) {
			setError(t("error_no_url"));
			return;
		}

		// Validate all URLs
		const invalidUrls = urlTags.filter((tag) => !isValidUrl(tag.text));
		if (invalidUrls.length > 0) {
			setError(t("error_invalid_urls", { urls: invalidUrls.map((tag) => tag.text).join(", ") }));
			return;
		}

		setError(null);
		setIsSubmitting(true);

		try {
			toast(t("crawling_toast"), {
				description: t("crawling_toast_desc"),
			});

			// Extract URLs from tags
			const urls = urlTags.map((tag) => tag.text);

			// Make API call to backend
			const response = await fetch(
				`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents`,
				{
					method: "POST",
					headers: {
						"Content-Type": "application/json",
						Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
					},
					body: JSON.stringify({
						document_type: "CRAWLED_URL",
						content: urls,
						// Explicit radix so the id is always parsed as decimal.
						search_space_id: Number.parseInt(search_space_id, 10),
					}),
				}
			);

			if (!response.ok) {
				throw new Error("Failed to crawl URLs");
			}

			await response.json();

			toast(t("success_toast"), {
				description: t("success_toast_desc"),
			});

			// Redirect to documents page
			router.push(`/dashboard/${search_space_id}/documents`);
		} catch (err: unknown) {
			// Anything can be thrown, so only trust `.message` on real Error objects
			// and fall back to the generic text instead of rendering "undefined".
			const message = err instanceof Error && err.message ? err.message : t("error_generic");
			setError(message);
			toast(t("error_toast"), {
				description: `${t("error_toast_desc")}: ${message}`,
			});
		} finally {
			setIsSubmitting(false);
		}
	};

	// Function to add a new URL tag
	const handleAddTag = (text: string) => {
		// Basic URL validation
		if (!isValidUrl(text)) {
			toast(t("invalid_url_toast"), {
				description: t("invalid_url_toast_desc"),
			});
			return;
		}

		// Check for duplicates
		if (urlTags.some((tag) => tag.text === text)) {
			toast(t("duplicate_url_toast"), {
				description: t("duplicate_url_toast_desc"),
			});
			return;
		}

		// Add the new tag
		const newTag: Tag = {
			id: Date.now().toString(),
			text: text,
		};

		// Functional update so the append is based on the latest state rather
		// than the array captured when this handler was created.
		setUrlTags((prev) => [...prev, newTag]);
	};

	return (
		<div className="container mx-auto py-8">
			<Card className="max-w-2xl mx-auto">
				<CardHeader>
					<CardTitle className="flex items-center gap-2">
						<Globe className="h-5 w-5" />
						{t("title")}
					</CardTitle>
					<CardDescription>{t("subtitle")}</CardDescription>
				</CardHeader>
				<CardContent>
					<div className="space-y-4">
						<div className="space-y-2">
							<Label htmlFor="url-input">{t("label")}</Label>
							<TagInput
								id="url-input"
								tags={urlTags}
								setTags={setUrlTags}
								placeholder={t("placeholder")}
								onAddTag={handleAddTag}
								styleClasses={{
									inlineTagsContainer:
										"border-input rounded-lg bg-background shadow-sm shadow-black/5 transition-shadow focus-within:border-ring focus-within:outline-none focus-within:ring-[3px] focus-within:ring-ring/20 p-1 gap-1",
									input: "w-full min-w-[80px] focus-visible:outline-none shadow-none px-2 h-7",
									tag: {
										body: "h-7 relative bg-background border border-input hover:bg-background rounded-md font-medium text-xs ps-2 pe-7 flex",
										closeButton:
											"absolute -inset-y-px -end-px p-0 rounded-e-lg flex size-7 transition-colors outline-0 focus-visible:outline focus-visible:outline-2 focus-visible:outline-ring/70 text-muted-foreground/80 hover:text-foreground",
									},
								}}
								activeTagIndex={activeTagIndex}
								setActiveTagIndex={setActiveTagIndex}
							/>
							<p className="text-xs text-muted-foreground mt-1">{t("hint")}</p>
						</div>

						{error && <div className="text-sm text-red-500 mt-2">{error}</div>}

						<div className="bg-muted/50 rounded-lg p-4 text-sm">
							<h4 className="font-medium mb-2">{t("tips_title")}</h4>
							<ul className="list-disc pl-5 space-y-1 text-muted-foreground">
								<li>{t("tip_1")}</li>
								<li>{t("tip_2")}</li>
								<li>{t("tip_3")}</li>
								<li>{t("tip_4")}</li>
							</ul>
						</div>
					</div>
				</CardContent>
				<CardFooter className="flex justify-between">
					<Button
						variant="outline"
						onClick={() => router.push(`/dashboard/${search_space_id}/documents`)}
					>
						{t("cancel")}
					</Button>
					<Button onClick={handleSubmit} disabled={isSubmitting || urlTags.length === 0}>
						{isSubmitting ? (
							<>
								<Loader2 className="mr-2 h-4 w-4 animate-spin" />
								{t("submitting")}
							</>
						) : (
							t("submit")
						)}
					</Button>
				</CardFooter>
			</Card>
		</div>
	);
}

View file

@ -138,7 +138,7 @@ export function DashboardBreadcrumb() {
"linkup-api": "LinkUp API",
"luma-connector": "Luma",
"elasticsearch-connector": "Elasticsearch",
"webcrawler-connector": "WebCrawler",
"webcrawler-connector": "Web Pages",
};
const connectorLabel = connectorLabels[connectorType] || connectorType;

View file

@ -53,5 +53,6 @@ export const editConnectorSchema = z.object({
LUMA_API_KEY: z.string().optional(),
ELASTICSEARCH_API_KEY: z.string().optional(),
FIRECRAWL_API_KEY: z.string().optional(),
INITIAL_URLS: z.string().optional()
});
export type EditConnectorFormValues = z.infer<typeof editConnectorSchema>;

View file

@ -29,7 +29,7 @@ const INTEGRATIONS: Integration[] = [
// Documentation & Knowledge
{ name: "Confluence", icon: "https://cdn.simpleicons.org/confluence/172B4D" },
{ name: "Notion", icon: "https://cdn.simpleicons.org/notion/000000/ffffff" },
{ name: "Web Crawler", icon: "https://cdn.jsdelivr.net/npm/lucide-static@0.294.0/icons/globe.svg"},
{ name: "Web Pages", icon: "https://cdn.jsdelivr.net/npm/lucide-static@0.294.0/icons/globe.svg"},
// Cloud Storage
{ name: "Google Drive", icon: "https://cdn.simpleicons.org/googledrive/4285F4" },

View file

@ -140,7 +140,7 @@ export const connectorCategories: ConnectorCategory[] = [
},
{
id: "webcrawler-connector",
title: "Web Crawler",
title: "Web Pages",
description: "webcrawler_desc",
icon: getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6"),
status: "available",

View file

@ -96,8 +96,7 @@ Before you begin, ensure you have:
| TTS_SERVICE_API_BASE | (Optional) Custom API base URL for the Text-to-Speech service |
| STT_SERVICE | Speech-to-Text API provider for Audio Files (e.g., `local/base`, `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) |
| STT_SERVICE_API_KEY | (Optional if local) API key for the Speech-to-Text service |
| STT_SERVICE_API_BASE | (Optional) Custom API base URL for the Speech-to-Text service |
| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling |
| STT_SERVICE_API_BASE | (Optional) Custom API base URL for the Speech-to-Text service |
| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats), `LLAMACLOUD` (supports 50+ formats including legacy document types), or `DOCLING` (local processing, supports PDF, Office docs, images, HTML, CSV) |
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) |
| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) |

View file

@ -62,6 +62,8 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
case EnumConnectorName.WEBCRAWLER_CONNECTOR:
return <Globe {...iconProps} />;
// Additional cases for non-enum connector types
case "CRAWLED_URL":
return <Globe {...iconProps} />;
case "YOUTUBE_VIDEO":
return <IconBrandYoutube {...iconProps} />;
case "FILE":

View file

@ -98,6 +98,7 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
LUMA_API_KEY: "",
ELASTICSEARCH_API_KEY: "",
FIRECRAWL_API_KEY: "",
INITIAL_URLS: ""
},
});
@ -144,6 +145,7 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
LUMA_API_KEY: config.LUMA_API_KEY || "",
ELASTICSEARCH_API_KEY: config.ELASTICSEARCH_API_KEY || "",
FIRECRAWL_API_KEY: config.FIRECRAWL_API_KEY || "",
// Key must be spelled exactly like the INITIAL_URLS form field; a misspelled
// key leaves the field empty after the form is reset from the saved config.
INITIAL_URLS: config.INITIAL_URLS || ""
});
if (currentConnector.connector_type === "GITHUB_CONNECTOR") {
const savedRepos = config.repo_full_names || [];
@ -472,16 +474,28 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
}
break;
case "WEBCRAWLER_CONNECTOR":
if (formData.FIRECRAWL_API_KEY !== originalConfig.FIRECRAWL_API_KEY) {
if (
formData.FIRECRAWL_API_KEY !== originalConfig.FIRECRAWL_API_KEY ||
formData.INITIAL_URLS !== originalConfig.INITIAL_URLS
) {
newConfig = {};
if (formData.FIRECRAWL_API_KEY && formData.FIRECRAWL_API_KEY.trim()) {
if (!formData.FIRECRAWL_API_KEY.startsWith("fc-")) {
toast.warning("Firecrawl API keys typically start with 'fc-'. Please verify your key.");
}
newConfig = { FIRECRAWL_API_KEY: formData.FIRECRAWL_API_KEY };
} else {
newConfig = {};
newConfig.FIRECRAWL_API_KEY = formData.FIRECRAWL_API_KEY.trim();
} else if (originalConfig.FIRECRAWL_API_KEY) {
toast.info("Firecrawl API key removed. Web crawler will use AsyncChromiumLoader as fallback.");
}
if (formData.INITIAL_URLS !== undefined) {
if (formData.INITIAL_URLS && formData.INITIAL_URLS.trim()) {
newConfig.INITIAL_URLS = formData.INITIAL_URLS.trim();
} else if (originalConfig.INITIAL_URLS) {
toast.info("URLs removed from crawler configuration.");
}
}
}
break;
}
@ -579,6 +593,7 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
);
} else if (connector.connector_type == "WEBCRAWLER_CONNECTOR") {
editForm.setValue("FIRECRAWL_API_KEY",newlySavedConfig.FIRECRAWL_API_KEY || "");
editForm.setValue("INITIAL_URLS", newlySavedConfig.INITIAL_URLS || "");
}
}
if (connector.connector_type === "GITHUB_CONNECTOR") {

View file

@ -18,7 +18,7 @@ export const getConnectorTypeDisplay = (type: string): string => {
AIRTABLE_CONNECTOR: "Airtable",
LUMA_CONNECTOR: "Luma",
ELASTICSEARCH_CONNECTOR: "Elasticsearch",
WEBCRAWLER_CONNECTOR: "Web Crawler",
WEBCRAWLER_CONNECTOR: "Web Pages",
};
return typeMap[type] || type;
};

View file

@ -332,7 +332,7 @@
"calendar_desc": "Connect to Google Calendar to search events, meetings and schedules.",
"gmail_desc": "Connect to your Gmail account to search through your emails.",
"zoom_desc": "Connect to Zoom to access meeting recordings and transcripts.",
"webcrawler_desc": "Scrape web pages using FireCrawl."
"webcrawler_desc": "Crawl web pages"
},
"upload_documents": {
"title": "Upload Documents",