mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
Renaming resources
This commit is contained in:
parent
6d19e0fad8
commit
121e2f0c0e
24 changed files with 117 additions and 273 deletions
|
|
@ -44,9 +44,9 @@ def upgrade() -> None:
|
|||
IF NOT EXISTS (
|
||||
SELECT 1 FROM pg_type t
|
||||
JOIN pg_enum e ON t.oid = e.enumtypid
|
||||
WHERE t.typname = 'documenttype' AND e.enumlabel = 'WEBCRAWLER_CONNECTOR'
|
||||
WHERE t.typname = 'documenttype' AND e.enumlabel = 'CRAWLED_URL'
|
||||
) THEN
|
||||
ALTER TYPE documenttype ADD VALUE 'WEBCRAWLER_CONNECTOR';
|
||||
ALTER TYPE documenttype ADD VALUE 'CRAWLED_URL';
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
|
|
@ -671,7 +671,7 @@ async def fetch_relevant_documents(
|
|||
(
|
||||
source_object,
|
||||
crawled_urls_chunks,
|
||||
) = await connector_service.search_crawled_urls(
|
||||
) = await connector_service.search_webcrawler(
|
||||
user_query=reformulated_query,
|
||||
user_id=user_id,
|
||||
search_space_id=search_space_id,
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ You are SurfSense, an advanced AI research assistant that provides detailed, wel
|
|||
- TAVILY_API: "Tavily search API results" (personalized search results)
|
||||
- LINKUP_API: "Linkup search API results" (personalized search results)
|
||||
- LUMA_CONNECTOR: "Luma events"
|
||||
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense web crawler" (personally selected websites)
|
||||
- WEBCRAWLER_CONNECTOR: "Webpages indexed by SurfSense" (personally selected websites)
|
||||
</knowledge_sources>
|
||||
|
||||
<instructions>
|
||||
|
|
|
|||
|
|
@ -208,9 +208,6 @@ class Config:
|
|||
# LlamaCloud API Key
|
||||
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
||||
|
||||
# Firecrawl API Key
|
||||
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
|
||||
|
||||
# Litellm TTS Configuration
|
||||
TTS_SERVICE = os.getenv("TTS_SERVICE")
|
||||
TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE")
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ from app.tasks.connector_indexers import (
|
|||
index_luma_events,
|
||||
index_notion_pages,
|
||||
index_slack_messages,
|
||||
index_webcrawler_urls,
|
||||
index_crawled_urls,
|
||||
)
|
||||
from app.users import current_active_user
|
||||
from app.utils.check_ownership import check_ownership
|
||||
|
|
@ -691,12 +691,12 @@ async def index_connector_content(
|
|||
response_message = "Elasticsearch indexing started in the background."
|
||||
|
||||
elif connector.connector_type == SearchSourceConnectorType.WEBCRAWLER_CONNECTOR:
|
||||
from app.tasks.celery_tasks.connector_tasks import index_webcrawler_urls_task
|
||||
from app.tasks.celery_tasks.connector_tasks import index_crawled_urls_task
|
||||
|
||||
logger.info(
|
||||
f"Triggering web pages indexing for connector {connector_id} into search space {search_space_id} from {indexing_from} to {indexing_to}"
|
||||
)
|
||||
index_webcrawler_urls_task.delay(
|
||||
index_crawled_urls_task.delay(
|
||||
connector_id, search_space_id, str(user.id), indexing_from, indexing_to
|
||||
)
|
||||
response_message = "Web page indexing started in the background."
|
||||
|
|
@ -1537,8 +1537,8 @@ async def run_elasticsearch_indexing(
|
|||
exc_info=True,
|
||||
)
|
||||
|
||||
# Add new helper functions for webcrawler indexing
|
||||
async def run_webcrawler_indexing_with_new_session(
|
||||
# Add new helper functions for crawled web page indexing
|
||||
async def run_web_page_indexing_with_new_session(
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
|
|
@ -1546,16 +1546,16 @@ async def run_webcrawler_indexing_with_new_session(
|
|||
end_date: str,
|
||||
):
|
||||
"""
|
||||
Create a new session and run the Webcrawler indexing task.
|
||||
Create a new session and run the web page indexing task.
|
||||
This prevents session leaks by creating a dedicated session for the background task.
|
||||
"""
|
||||
async with async_session_maker() as session:
|
||||
await run_webcrawler_indexing(
|
||||
await run_web_page_indexing(
|
||||
session, connector_id, search_space_id, user_id, start_date, end_date
|
||||
)
|
||||
|
||||
|
||||
async def run_webcrawler_indexing(
|
||||
async def run_web_page_indexing(
|
||||
session: AsyncSession,
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
|
|
@ -1564,7 +1564,7 @@ async def run_webcrawler_indexing(
|
|||
end_date: str,
|
||||
):
|
||||
"""
|
||||
Background task to run Webcrawler indexing.
|
||||
Background task to run web page indexing.
|
||||
Args:
|
||||
session: Database session
|
||||
connector_id: ID of the webcrawler connector
|
||||
|
|
@ -1574,7 +1574,7 @@ async def run_webcrawler_indexing(
|
|||
end_date: End date for indexing
|
||||
"""
|
||||
try:
|
||||
documents_processed, error_or_warning = await index_webcrawler_urls(
|
||||
documents_processed, error_or_warning = await index_crawled_urls(
|
||||
session=session,
|
||||
connector_id=connector_id,
|
||||
search_space_id=search_space_id,
|
||||
|
|
@ -1588,11 +1588,11 @@ async def run_webcrawler_indexing(
|
|||
if documents_processed > 0:
|
||||
await update_connector_last_indexed(session, connector_id)
|
||||
logger.info(
|
||||
f"Webcrawler indexing completed successfully: {documents_processed} documents processed"
|
||||
f"Web page indexing completed successfully: {documents_processed} documents processed"
|
||||
)
|
||||
else:
|
||||
logger.error(
|
||||
f"Webcrawler indexing failed or no documents processed: {error_or_warning}"
|
||||
f"Web page indexing failed or no documents processed: {error_or_warning}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in background Webcrawler indexing task: {e!s}")
|
||||
logger.error(f"Error in background Web page indexing task: {e!s}")
|
||||
|
|
@ -2573,4 +2573,4 @@ class ConnectorService:
|
|||
"sources": sources_list,
|
||||
}
|
||||
|
||||
return result_object, elasticsearch_chunks
|
||||
return result_object, elasticsearch_chunks
|
||||
|
|
@ -602,8 +602,8 @@ async def _index_elasticsearch_documents(
|
|||
)
|
||||
|
||||
|
||||
@celery_app.task(name="index_webcrawler_urls", bind=True)
|
||||
def index_webcrawler_urls_task(
|
||||
@celery_app.task(name="index_crawled_urls", bind=True)
|
||||
def index_crawled_urls_task(
|
||||
self,
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
|
|
@ -611,7 +611,7 @@ def index_webcrawler_urls_task(
|
|||
start_date: str,
|
||||
end_date: str,
|
||||
):
|
||||
"""Celery task to index Webcrawler Urls."""
|
||||
"""Celery task to index Web page Urls."""
|
||||
import asyncio
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
|
|
@ -619,7 +619,7 @@ def index_webcrawler_urls_task(
|
|||
|
||||
try:
|
||||
loop.run_until_complete(
|
||||
_index_webcrawler_urls(
|
||||
_index_crawled_urls(
|
||||
connector_id, search_space_id, user_id, start_date, end_date
|
||||
)
|
||||
)
|
||||
|
|
@ -627,19 +627,19 @@ def index_webcrawler_urls_task(
|
|||
loop.close()
|
||||
|
||||
|
||||
async def _index_webcrawler_urls(
|
||||
async def _index_crawled_urls(
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
user_id: str,
|
||||
start_date: str,
|
||||
end_date: str,
|
||||
):
|
||||
"""Index Webcrawler Urls with new session."""
|
||||
"""Index Web page Urls with new session."""
|
||||
from app.routes.search_source_connectors_routes import (
|
||||
run_webcrawler_indexing,
|
||||
run_web_page_indexing,
|
||||
)
|
||||
|
||||
async with get_celery_session_maker()() as session:
|
||||
await run_webcrawler_indexing(
|
||||
await run_web_page_indexing(
|
||||
session, connector_id, search_space_id, user_id, start_date, end_date
|
||||
)
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ async def _check_and_trigger_schedules():
|
|||
index_luma_events_task,
|
||||
index_notion_pages_task,
|
||||
index_slack_messages_task,
|
||||
index_webcrawler_urls_task
|
||||
index_crawled_urls_task
|
||||
)
|
||||
|
||||
# Map connector types to their tasks
|
||||
|
|
@ -95,7 +95,7 @@ async def _check_and_trigger_schedules():
|
|||
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
|
||||
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
|
||||
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
|
||||
}
|
||||
|
||||
# Trigger indexing for each due connector
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ from .luma_indexer import index_luma_events
|
|||
# Documentation and knowledge management
|
||||
from .notion_indexer import index_notion_pages
|
||||
from .slack_indexer import index_slack_messages
|
||||
from .webcrawler_indexer import index_webcrawler_urls
|
||||
from .webcrawler_indexer import index_crawled_urls
|
||||
|
||||
__all__ = [ # noqa: RUF022
|
||||
"index_airtable_records",
|
||||
|
|
@ -60,7 +60,7 @@ __all__ = [ # noqa: RUF022
|
|||
"index_linear_issues",
|
||||
# Documentation and knowledge management
|
||||
"index_notion_pages",
|
||||
"index_webcrawler_urls",
|
||||
"index_crawled_urls",
|
||||
# Communication platforms
|
||||
"index_slack_messages",
|
||||
"index_google_gmail_messages",
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ from .base import (
|
|||
)
|
||||
|
||||
|
||||
async def index_webcrawler_urls(
|
||||
async def index_crawled_urls(
|
||||
session: AsyncSession,
|
||||
connector_id: int,
|
||||
search_space_id: int,
|
||||
|
|
@ -37,7 +37,7 @@ async def index_webcrawler_urls(
|
|||
update_last_indexed: bool = True,
|
||||
) -> tuple[int, str | None]:
|
||||
"""
|
||||
Index webcrawler URLs.
|
||||
Index web page URLs.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
|
|
@ -55,9 +55,9 @@ async def index_webcrawler_urls(
|
|||
|
||||
# Log task start
|
||||
log_entry = await task_logger.log_task_start(
|
||||
task_name="webcrawler_url_indexing",
|
||||
task_name="crawled_url_indexing",
|
||||
source="connector_indexing_task",
|
||||
message=f"Starting webcrawler URL indexing for connector {connector_id}",
|
||||
message=f"Starting web page URL indexing for connector {connector_id}",
|
||||
metadata={
|
||||
"connector_id": connector_id,
|
||||
"user_id": str(user_id),
|
||||
|
|
@ -104,7 +104,7 @@ async def index_webcrawler_urls(
|
|||
urls = []
|
||||
|
||||
logger.info(
|
||||
f"Starting webcrawler indexing for connector {connector_id} with {len(urls)} URLs"
|
||||
f"Starting crawled web page indexing for connector {connector_id} with {len(urls)} URLs"
|
||||
)
|
||||
|
||||
# Initialize webcrawler client
|
||||
|
|
@ -367,7 +367,7 @@ async def index_webcrawler_urls(
|
|||
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully completed webcrawler indexing for connector {connector_id}",
|
||||
f"Successfully completed crawled web page indexing for connector {connector_id}",
|
||||
{
|
||||
"urls_processed": total_processed,
|
||||
"documents_indexed": documents_indexed,
|
||||
|
|
@ -378,7 +378,7 @@ async def index_webcrawler_urls(
|
|||
)
|
||||
|
||||
logger.info(
|
||||
f"Webcrawler indexing completed: {documents_indexed} new, "
|
||||
f"Web page indexing completed: {documents_indexed} new, "
|
||||
f"{documents_updated} updated, {documents_skipped} skipped, "
|
||||
f"{len(failed_urls)} failed"
|
||||
)
|
||||
|
|
@ -388,7 +388,7 @@ async def index_webcrawler_urls(
|
|||
await session.rollback()
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Database error during webcrawler indexing for connector {connector_id}",
|
||||
f"Database error during web page indexing for connector {connector_id}",
|
||||
str(db_error),
|
||||
{"error_type": "SQLAlchemyError"},
|
||||
)
|
||||
|
|
@ -398,12 +398,12 @@ async def index_webcrawler_urls(
|
|||
await session.rollback()
|
||||
await task_logger.log_task_failure(
|
||||
log_entry,
|
||||
f"Failed to index webcrawler URLs for connector {connector_id}",
|
||||
f"Failed to index web page URLs for connector {connector_id}",
|
||||
str(e),
|
||||
{"error_type": type(e).__name__},
|
||||
)
|
||||
logger.error(f"Failed to index webcrawler URLs: {e!s}", exc_info=True)
|
||||
return 0, f"Failed to index webcrawler URLs: {e!s}"
|
||||
logger.error(f"Failed to index web page URLs: {e!s}", exc_info=True)
|
||||
return 0, f"Failed to index web page URLs: {e!s}"
|
||||
|
||||
|
||||
async def get_crawled_url_documents(
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ CONNECTOR_TASK_MAP = {
|
|||
SearchSourceConnectorType.DISCORD_CONNECTOR: "index_discord_messages",
|
||||
SearchSourceConnectorType.LUMA_CONNECTOR: "index_luma_events",
|
||||
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: "index_elasticsearch_documents",
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_webcrawler_urls",
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: "index_crawled_urls",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -80,7 +80,7 @@ def create_periodic_schedule(
|
|||
index_luma_events_task,
|
||||
index_notion_pages_task,
|
||||
index_slack_messages_task,
|
||||
index_webcrawler_urls_task,
|
||||
index_crawled_urls_task,
|
||||
)
|
||||
|
||||
# Map connector type to task
|
||||
|
|
@ -98,7 +98,7 @@ def create_periodic_schedule(
|
|||
SearchSourceConnectorType.DISCORD_CONNECTOR: index_discord_messages_task,
|
||||
SearchSourceConnectorType.LUMA_CONNECTOR: index_luma_events_task,
|
||||
SearchSourceConnectorType.ELASTICSEARCH_CONNECTOR: index_elasticsearch_documents_task,
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_webcrawler_urls_task,
|
||||
SearchSourceConnectorType.WEBCRAWLER_CONNECTOR: index_crawled_urls_task,
|
||||
}
|
||||
|
||||
# Trigger the first run immediately
|
||||
|
|
|
|||
|
|
@ -18,7 +18,16 @@ import {
|
|||
CardHeader,
|
||||
CardTitle,
|
||||
} from "@/components/ui/card";
|
||||
import { Form } from "@/components/ui/form";
|
||||
import {
|
||||
Form,
|
||||
FormControl,
|
||||
FormDescription,
|
||||
FormField,
|
||||
FormItem,
|
||||
FormLabel,
|
||||
FormMessage,
|
||||
} from "@/components/ui/form";
|
||||
import { Textarea } from "@/components/ui/textarea";
|
||||
import { getConnectorIcon } from "@/contracts/enums/connectorIcons";
|
||||
import { useConnectorEditPage } from "@/hooks/use-connector-edit-page";
|
||||
// Import Utils, Types, Hook, and Components
|
||||
|
|
@ -285,13 +294,35 @@ export default function EditConnectorPage() {
|
|||
|
||||
{/* == Webcrawler == */}
|
||||
{connector.connector_type === "WEBCRAWLER_CONNECTOR" && (
|
||||
<EditSimpleTokenForm
|
||||
control={editForm.control}
|
||||
fieldName="FIRECRAWL_API_KEY"
|
||||
fieldLabel="Firecrawl API Key (Optional)"
|
||||
fieldDescription="Add a Firecrawl API key for enhanced crawling capabilities. If not provided, will use AsyncChromiumLoader as fallback."
|
||||
placeholder="fc-xxxxxxxxxxxxx"
|
||||
/>
|
||||
<div className="space-y-4">
|
||||
<EditSimpleTokenForm
|
||||
control={editForm.control}
|
||||
fieldName="FIRECRAWL_API_KEY"
|
||||
fieldLabel="Firecrawl API Key (Optional)"
|
||||
fieldDescription="Add a Firecrawl API key for enhanced crawling capabilities. If not provided, will use AsyncChromiumLoader as fallback."
|
||||
placeholder="fc-xxxxxxxxxxxxx"
|
||||
/>
|
||||
<FormField
|
||||
control={editForm.control}
|
||||
name="INITIAL_URLS"
|
||||
render={({ field }) => (
|
||||
<FormItem>
|
||||
<FormLabel>URLs to Crawl</FormLabel>
|
||||
<FormControl>
|
||||
<Textarea
|
||||
placeholder="https://example.com https://docs.example.com https://blog.example.com"
|
||||
className="min-h-[150px] font-mono text-sm"
|
||||
{...field}
|
||||
/>
|
||||
</FormControl>
|
||||
<FormDescription>
|
||||
Enter URLs to crawl (one per line). These URLs will be indexed when you trigger indexing.
|
||||
</FormDescription>
|
||||
<FormMessage />
|
||||
</FormItem>
|
||||
)}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
|
||||
</CardContent>
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ const getConnectorTypeDisplay = (type: string): string => {
|
|||
AIRTABLE_CONNECTOR: "Airtable Connector",
|
||||
LUMA_CONNECTOR: "Luma Connector",
|
||||
ELASTICSEARCH_CONNECTOR: "Elasticsearch Connector",
|
||||
WEBCRAWLER_CONNECTOR: "Web Crawler Connector",
|
||||
WEBCRAWLER_CONNECTOR: "Web Page Connector",
|
||||
// Add other connector types here as needed
|
||||
};
|
||||
return typeMap[type] || type;
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ export default function WebcrawlerConnectorPage() {
|
|||
const form = useForm<WebcrawlerConnectorFormValues>({
|
||||
resolver: zodResolver(webcrawlerConnectorFormSchema),
|
||||
defaultValues: {
|
||||
name: "Web Crawler",
|
||||
name: "Web Pages",
|
||||
api_key: "",
|
||||
initial_urls: "",
|
||||
},
|
||||
|
|
@ -150,7 +150,7 @@ export default function WebcrawlerConnectorPage() {
|
|||
{getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6")}
|
||||
</div>
|
||||
<div>
|
||||
<h1 className="text-3xl font-bold tracking-tight">Connect Web Crawler</h1>
|
||||
<h1 className="text-3xl font-bold tracking-tight">Connect Web Pages</h1>
|
||||
<p className="text-muted-foreground">Crawl and index web pages for search.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
|
@ -160,9 +160,9 @@ export default function WebcrawlerConnectorPage() {
|
|||
{!doesConnectorExist ? (
|
||||
<Card>
|
||||
<CardHeader>
|
||||
<CardTitle>Set Up Web Crawler</CardTitle>
|
||||
<CardTitle>Set Up Web Page crawler</CardTitle>
|
||||
<CardDescription>
|
||||
Configure your web crawler to index web pages. Optionally add a Firecrawl API key
|
||||
Configure your web page crawler to index web pages. Optionally add a Firecrawl API key
|
||||
for enhanced crawling capabilities.
|
||||
</CardDescription>
|
||||
</CardHeader>
|
||||
|
|
@ -277,7 +277,7 @@ export default function WebcrawlerConnectorPage() {
|
|||
/* Success Card */
|
||||
<Card>
|
||||
<CardHeader>
|
||||
<CardTitle>✅ Your web crawler is successfully set up!</CardTitle>
|
||||
<CardTitle>✅ Your web page crawler is successfully set up!</CardTitle>
|
||||
<CardDescription>
|
||||
You can now add URLs to crawl from the connector management page.
|
||||
</CardDescription>
|
||||
|
|
|
|||
|
|
@ -1,201 +0,0 @@
|
|||
"use client";
|
||||
|
||||
import { type Tag, TagInput } from "emblor";
|
||||
import { Globe, Loader2 } from "lucide-react";
|
||||
import { useParams, useRouter } from "next/navigation";
|
||||
import { useTranslations } from "next-intl";
|
||||
import { useState } from "react";
|
||||
import { toast } from "sonner";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import {
|
||||
Card,
|
||||
CardContent,
|
||||
CardDescription,
|
||||
CardFooter,
|
||||
CardHeader,
|
||||
CardTitle,
|
||||
} from "@/components/ui/card";
|
||||
import { Label } from "@/components/ui/label";
|
||||
|
||||
// URL validation regex
|
||||
const urlRegex = /^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})([/\w .-]*)*\/?$/;
|
||||
|
||||
export default function WebpageCrawler() {
|
||||
const t = useTranslations("add_webpage");
|
||||
const params = useParams();
|
||||
const router = useRouter();
|
||||
const search_space_id = params.search_space_id as string;
|
||||
|
||||
const [urlTags, setUrlTags] = useState<Tag[]>([]);
|
||||
const [activeTagIndex, setActiveTagIndex] = useState<number | null>(null);
|
||||
const [isSubmitting, setIsSubmitting] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
// Function to validate a URL
|
||||
const isValidUrl = (url: string): boolean => {
|
||||
return urlRegex.test(url);
|
||||
};
|
||||
|
||||
// Function to handle URL submission
|
||||
const handleSubmit = async () => {
|
||||
// Validate that we have at least one URL
|
||||
if (urlTags.length === 0) {
|
||||
setError(t("error_no_url"));
|
||||
return;
|
||||
}
|
||||
|
||||
// Validate all URLs
|
||||
const invalidUrls = urlTags.filter((tag) => !isValidUrl(tag.text));
|
||||
if (invalidUrls.length > 0) {
|
||||
setError(t("error_invalid_urls", { urls: invalidUrls.map((tag) => tag.text).join(", ") }));
|
||||
return;
|
||||
}
|
||||
|
||||
setError(null);
|
||||
setIsSubmitting(true);
|
||||
|
||||
try {
|
||||
toast(t("crawling_toast"), {
|
||||
description: t("crawling_toast_desc"),
|
||||
});
|
||||
|
||||
// Extract URLs from tags
|
||||
const urls = urlTags.map((tag) => tag.text);
|
||||
|
||||
// Make API call to backend
|
||||
const response = await fetch(
|
||||
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents`,
|
||||
{
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
|
||||
},
|
||||
body: JSON.stringify({
|
||||
document_type: "CRAWLED_URL",
|
||||
content: urls,
|
||||
search_space_id: parseInt(search_space_id),
|
||||
}),
|
||||
}
|
||||
);
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error("Failed to crawl URLs");
|
||||
}
|
||||
|
||||
await response.json();
|
||||
|
||||
toast(t("success_toast"), {
|
||||
description: t("success_toast_desc"),
|
||||
});
|
||||
|
||||
// Redirect to documents page
|
||||
router.push(`/dashboard/${search_space_id}/documents`);
|
||||
} catch (error: any) {
|
||||
setError(error.message || t("error_generic"));
|
||||
toast(t("error_toast"), {
|
||||
description: `${t("error_toast_desc")}: ${error.message}`,
|
||||
});
|
||||
} finally {
|
||||
setIsSubmitting(false);
|
||||
}
|
||||
};
|
||||
|
||||
// Function to add a new URL tag
|
||||
const handleAddTag = (text: string) => {
|
||||
// Basic URL validation
|
||||
if (!isValidUrl(text)) {
|
||||
toast(t("invalid_url_toast"), {
|
||||
description: t("invalid_url_toast_desc"),
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for duplicates
|
||||
if (urlTags.some((tag) => tag.text === text)) {
|
||||
toast(t("duplicate_url_toast"), {
|
||||
description: t("duplicate_url_toast_desc"),
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// Add the new tag
|
||||
const newTag: Tag = {
|
||||
id: Date.now().toString(),
|
||||
text: text,
|
||||
};
|
||||
|
||||
setUrlTags([...urlTags, newTag]);
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="container mx-auto py-8">
|
||||
<Card className="max-w-2xl mx-auto">
|
||||
<CardHeader>
|
||||
<CardTitle className="flex items-center gap-2">
|
||||
<Globe className="h-5 w-5" />
|
||||
{t("title")}
|
||||
</CardTitle>
|
||||
<CardDescription>{t("subtitle")}</CardDescription>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
<div className="space-y-4">
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="url-input">{t("label")}</Label>
|
||||
<TagInput
|
||||
id="url-input"
|
||||
tags={urlTags}
|
||||
setTags={setUrlTags}
|
||||
placeholder={t("placeholder")}
|
||||
onAddTag={handleAddTag}
|
||||
styleClasses={{
|
||||
inlineTagsContainer:
|
||||
"border-input rounded-lg bg-background shadow-sm shadow-black/5 transition-shadow focus-within:border-ring focus-within:outline-none focus-within:ring-[3px] focus-within:ring-ring/20 p-1 gap-1",
|
||||
input: "w-full min-w-[80px] focus-visible:outline-none shadow-none px-2 h-7",
|
||||
tag: {
|
||||
body: "h-7 relative bg-background border border-input hover:bg-background rounded-md font-medium text-xs ps-2 pe-7 flex",
|
||||
closeButton:
|
||||
"absolute -inset-y-px -end-px p-0 rounded-e-lg flex size-7 transition-colors outline-0 focus-visible:outline focus-visible:outline-2 focus-visible:outline-ring/70 text-muted-foreground/80 hover:text-foreground",
|
||||
},
|
||||
}}
|
||||
activeTagIndex={activeTagIndex}
|
||||
setActiveTagIndex={setActiveTagIndex}
|
||||
/>
|
||||
<p className="text-xs text-muted-foreground mt-1">{t("hint")}</p>
|
||||
</div>
|
||||
|
||||
{error && <div className="text-sm text-red-500 mt-2">{error}</div>}
|
||||
|
||||
<div className="bg-muted/50 rounded-lg p-4 text-sm">
|
||||
<h4 className="font-medium mb-2">{t("tips_title")}</h4>
|
||||
<ul className="list-disc pl-5 space-y-1 text-muted-foreground">
|
||||
<li>{t("tip_1")}</li>
|
||||
<li>{t("tip_2")}</li>
|
||||
<li>{t("tip_3")}</li>
|
||||
<li>{t("tip_4")}</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</CardContent>
|
||||
<CardFooter className="flex justify-between">
|
||||
<Button
|
||||
variant="outline"
|
||||
onClick={() => router.push(`/dashboard/${search_space_id}/documents`)}
|
||||
>
|
||||
{t("cancel")}
|
||||
</Button>
|
||||
<Button onClick={handleSubmit} disabled={isSubmitting || urlTags.length === 0}>
|
||||
{isSubmitting ? (
|
||||
<>
|
||||
<Loader2 className="mr-2 h-4 w-4 animate-spin" />
|
||||
{t("submitting")}
|
||||
</>
|
||||
) : (
|
||||
t("submit")
|
||||
)}
|
||||
</Button>
|
||||
</CardFooter>
|
||||
</Card>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
|
@ -138,7 +138,7 @@ export function DashboardBreadcrumb() {
|
|||
"linkup-api": "LinkUp API",
|
||||
"luma-connector": "Luma",
|
||||
"elasticsearch-connector": "Elasticsearch",
|
||||
"webcrawler-connector": "WebCrawler",
|
||||
"webcrawler-connector": "Web Pages",
|
||||
};
|
||||
|
||||
const connectorLabel = connectorLabels[connectorType] || connectorType;
|
||||
|
|
|
|||
|
|
@ -53,5 +53,6 @@ export const editConnectorSchema = z.object({
|
|||
LUMA_API_KEY: z.string().optional(),
|
||||
ELASTICSEARCH_API_KEY: z.string().optional(),
|
||||
FIRECRAWL_API_KEY: z.string().optional(),
|
||||
INITIAL_URLS: z.string().optional()
|
||||
});
|
||||
export type EditConnectorFormValues = z.infer<typeof editConnectorSchema>;
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ const INTEGRATIONS: Integration[] = [
|
|||
// Documentation & Knowledge
|
||||
{ name: "Confluence", icon: "https://cdn.simpleicons.org/confluence/172B4D" },
|
||||
{ name: "Notion", icon: "https://cdn.simpleicons.org/notion/000000/ffffff" },
|
||||
{ name: "Web Crawler", icon: "https://cdn.jsdelivr.net/npm/lucide-static@0.294.0/icons/globe.svg"},
|
||||
{ name: "Web Pages", icon: "https://cdn.jsdelivr.net/npm/lucide-static@0.294.0/icons/globe.svg"},
|
||||
|
||||
// Cloud Storage
|
||||
{ name: "Google Drive", icon: "https://cdn.simpleicons.org/googledrive/4285F4" },
|
||||
|
|
|
|||
|
|
@ -140,7 +140,7 @@ export const connectorCategories: ConnectorCategory[] = [
|
|||
},
|
||||
{
|
||||
id: "webcrawler-connector",
|
||||
title: "Web Crawler",
|
||||
title: "Web Pages",
|
||||
description: "webcrawler_desc",
|
||||
icon: getConnectorIcon(EnumConnectorName.WEBCRAWLER_CONNECTOR, "h-6 w-6"),
|
||||
status: "available",
|
||||
|
|
|
|||
|
|
@ -96,8 +96,7 @@ Before you begin, ensure you have:
|
|||
| TTS_SERVICE_API_BASE | (Optional) Custom API base URL for the Text-to-Speech service |
|
||||
| STT_SERVICE | Speech-to-Text API provider for Audio Files (e.g., `local/base`, `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) |
|
||||
| STT_SERVICE_API_KEY | (Optional if local) API key for the Speech-to-Text service |
|
||||
| STT_SERVICE_API_BASE | (Optional) Custom API base URL for the Speech-to-Text service |
|
||||
| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling |
|
||||
| STT_SERVICE_API_BASE | (Optional) Custom API base URL for the Speech-to-Text service |
|
||||
| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats), `LLAMACLOUD` (supports 50+ formats including legacy document types), or `DOCLING` (local processing, supports PDF, Office docs, images, HTML, CSV) |
|
||||
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) |
|
||||
| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) |
|
||||
|
|
|
|||
|
|
@ -62,6 +62,8 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas
|
|||
case EnumConnectorName.WEBCRAWLER_CONNECTOR:
|
||||
return <Globe {...iconProps} />;
|
||||
// Additional cases for non-enum connector types
|
||||
case "CRAWLED_URL":
|
||||
return <Globe {...iconProps} />;
|
||||
case "YOUTUBE_VIDEO":
|
||||
return <IconBrandYoutube {...iconProps} />;
|
||||
case "FILE":
|
||||
|
|
|
|||
|
|
@ -98,6 +98,7 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
|
|||
LUMA_API_KEY: "",
|
||||
ELASTICSEARCH_API_KEY: "",
|
||||
FIRECRAWL_API_KEY: "",
|
||||
INITIAL_URLS: ""
|
||||
},
|
||||
});
|
||||
|
||||
|
|
@ -144,6 +145,7 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
|
|||
LUMA_API_KEY: config.LUMA_API_KEY || "",
|
||||
ELASTICSEARCH_API_KEY: config.ELASTICSEARCH_API_KEY || "",
|
||||
FIRECRAWL_API_KEY: config.FIRECRAWL_API_KEY || "",
|
||||
INITIAL_URLS: config.INITIAL_URLS || ""
|
||||
});
|
||||
if (currentConnector.connector_type === "GITHUB_CONNECTOR") {
|
||||
const savedRepos = config.repo_full_names || [];
|
||||
|
|
@ -472,16 +474,28 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
|
|||
}
|
||||
break;
|
||||
case "WEBCRAWLER_CONNECTOR":
|
||||
if (formData.FIRECRAWL_API_KEY !== originalConfig.FIRECRAWL_API_KEY) {
|
||||
if (
|
||||
formData.FIRECRAWL_API_KEY !== originalConfig.FIRECRAWL_API_KEY ||
|
||||
formData.INITIAL_URLS !== originalConfig.INITIAL_URLS
|
||||
) {
|
||||
newConfig = {};
|
||||
|
||||
if (formData.FIRECRAWL_API_KEY && formData.FIRECRAWL_API_KEY.trim()) {
|
||||
if (!formData.FIRECRAWL_API_KEY.startsWith("fc-")) {
|
||||
toast.warning("Firecrawl API keys typically start with 'fc-'. Please verify your key.");
|
||||
}
|
||||
newConfig = { FIRECRAWL_API_KEY: formData.FIRECRAWL_API_KEY };
|
||||
} else {
|
||||
newConfig = {};
|
||||
newConfig.FIRECRAWL_API_KEY = formData.FIRECRAWL_API_KEY.trim();
|
||||
} else if (originalConfig.FIRECRAWL_API_KEY) {
|
||||
toast.info("Firecrawl API key removed. Web crawler will use AsyncChromiumLoader as fallback.");
|
||||
}
|
||||
|
||||
if (formData.INITIAL_URLS !== undefined) {
|
||||
if (formData.INITIAL_URLS && formData.INITIAL_URLS.trim()) {
|
||||
newConfig.INITIAL_URLS = formData.INITIAL_URLS.trim();
|
||||
} else if (originalConfig.INITIAL_URLS) {
|
||||
toast.info("URLs removed from crawler configuration.");
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
@ -579,6 +593,7 @@ export function useConnectorEditPage(connectorId: number, searchSpaceId: string)
|
|||
);
|
||||
} else if (connector.connector_type == "WEBCRAWLER_CONNECTOR") {
|
||||
editForm.setValue("FIRECRAWL_API_KEY",newlySavedConfig.FIRECRAWL_API_KEY || "");
|
||||
editForm.setValue("INITIAL_URLS", newlySavedConfig.INITIAL_URLS || "");
|
||||
}
|
||||
}
|
||||
if (connector.connector_type === "GITHUB_CONNECTOR") {
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ export const getConnectorTypeDisplay = (type: string): string => {
|
|||
AIRTABLE_CONNECTOR: "Airtable",
|
||||
LUMA_CONNECTOR: "Luma",
|
||||
ELASTICSEARCH_CONNECTOR: "Elasticsearch",
|
||||
WEBCRAWLER_CONNECTOR: "Web Crawler",
|
||||
WEBCRAWLER_CONNECTOR: "Web Pages",
|
||||
};
|
||||
return typeMap[type] || type;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -332,7 +332,7 @@
|
|||
"calendar_desc": "Connect to Google Calendar to search events, meetings and schedules.",
|
||||
"gmail_desc": "Connect to your Gmail account to search through your emails.",
|
||||
"zoom_desc": "Connect to Zoom to access meeting recordings and transcripts.",
|
||||
"webcrawler_desc": "Scrape web pages using FireCrawl."
|
||||
"webcrawler_desc": "Crawl web pages"
|
||||
},
|
||||
"upload_documents": {
|
||||
"title": "Upload Documents",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue