"""YouTube utility routes (playlist resolution).

The router defined here is mounted on the main API router in
``app/routes/__init__.py`` via ``router.include_router(youtube_router)``.
"""

import json
import logging
import re

import aiohttp
from fake_useragent import UserAgent
from fastapi import APIRouter, Depends, HTTPException, Query

from app.db import User
from app.users import current_active_user
from app.utils.proxy_config import get_requests_proxies

router = APIRouter()
logger = logging.getLogger(__name__)

# Captures the value of the ``list`` query parameter of a URL.
_PLAYLIST_ID_RE = re.compile(r"[?&]list=([\w-]+)")

_INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/browse"
_INNERTUBE_CLIENT = {
    "clientName": "WEB",
    "clientVersion": "2.20240313.05.00",
    "hl": "en",
    "gl": "US",
}
# Upper bound (seconds) for the whole innertube request. Without an explicit
# value aiohttp applies its own, much longer, default and a stalled upstream
# would keep the request handler busy for minutes.
_INNERTUBE_TIMEOUT_SECONDS = 15


@router.get("/youtube/playlist-videos")
async def get_playlist_videos(
    url: str = Query(..., description="YouTube playlist URL"),
    _user: User = Depends(current_active_user),
):
    """Resolve a YouTube playlist URL into individual video URLs.

    The playlist id is taken from the ``list`` query parameter of ``url``.
    The innertube API is tried first; if it yields nothing, the playlist HTML
    page is scraped as a fallback.

    Args:
        url: Any URL carrying a ``list=<playlist id>`` query parameter.
        _user: Authenticated user; only used to require authentication.

    Returns:
        ``{"video_urls": [...], "count": <int>}`` where every entry is a
        ``https://www.youtube.com/watch?v=<id>`` URL.

    Raises:
        HTTPException: 400 if ``url`` has no ``list`` parameter, 404 if no
            videos could be found, 500 on an unexpected failure.
    """
    match = _PLAYLIST_ID_RE.search(url)
    if not match:
        raise HTTPException(status_code=400, detail="Invalid YouTube playlist URL")

    playlist_id = match.group(1)

    # Only the two fetch calls can fail unexpectedly, so only they are guarded.
    try:
        video_ids = await _fetch_playlist_via_innertube(playlist_id)
        if not video_ids:
            video_ids = await _fetch_playlist_via_html(playlist_id)
    except Exception as e:
        # Log the regex-validated id (not the raw user-supplied URL) together
        # with the traceback; the client gets a generic message so internal
        # error text is not exposed.
        logger.exception("Error resolving playlist %s", playlist_id)
        raise HTTPException(
            status_code=500,
            detail="Failed to resolve playlist",
        ) from e

    if not video_ids:
        raise HTTPException(
            status_code=404,
            detail="No videos found in the playlist. It may be private or empty.",
        )

    video_urls = [f"https://www.youtube.com/watch?v={vid}" for vid in video_ids]
    return {"video_urls": video_urls, "count": len(video_urls)}


async def _fetch_playlist_via_innertube(playlist_id: str) -> list[str]:
    """Fetch playlist videos using YouTube's innertube API (no cookies needed).

    Best-effort: any HTTP, network, timeout or parsing failure is logged as a
    warning and reported as an empty list so the caller can fall back to
    another strategy.

    Args:
        playlist_id: Playlist identifier (the value of the ``list`` parameter).

    Returns:
        Video ids in playlist order, or ``[]`` when nothing could be fetched.
    """
    # NOTE(review): a single browse request is issued; if the API pages long
    # playlists, only the first page is returned here - confirm whether
    # continuation handling is needed.
    payload = {
        "context": {"client": _INNERTUBE_CLIENT},
        "browseId": f"VL{playlist_id}",
    }
    proxies = get_requests_proxies()
    timeout = aiohttp.ClientTimeout(total=_INNERTUBE_TIMEOUT_SECONDS)

    try:
        async with aiohttp.ClientSession(timeout=timeout) as session, session.post(
            _INNERTUBE_API_URL,
            json=payload,
            headers={"Content-Type": "application/json"},
            proxy=proxies["http"] if proxies else None,
        ) as response:
            if response.status != 200:
                logger.warning(
                    "Innertube API returned %d for playlist %s",
                    response.status,
                    playlist_id,
                )
                return []
            data = await response.json()

        return _extract_playlist_video_ids(data)
    except Exception as e:
        # Deliberately broad: this strategy must never abort the request.
        logger.warning("Innertube API failed for playlist %s: %s", playlist_id, e)
        return []
"User-Agent": ua.random, + "Accept-Language": "en-US,en;q=0.9", + } + cookies = { + "CONSENT": "PENDING+999", + "SOCS": "CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjMwODI5LjA3X3AxGgJlbiADGgYIgOa_pgY", + } + proxies = get_requests_proxies() + playlist_url = f"https://www.youtube.com/playlist?list={playlist_id}" + + try: + async with ( + aiohttp.ClientSession(cookies=cookies) as session, + session.get( + playlist_url, + headers=headers, + proxy=proxies["http"] if proxies else None, + ) as response, + ): + if response.status != 200: + logger.warning( + "HTML fallback returned %d for playlist %s", + response.status, + playlist_id, + ) + return [] + html = await response.text() + + yt_data = _extract_yt_initial_data(html) + if not yt_data: + logger.warning( + "Could not find ytInitialData in HTML for playlist %s", + playlist_id, + ) + return [] + + return _extract_playlist_video_ids(yt_data) + except Exception as e: + logger.warning("HTML fallback failed for playlist %s: %s", playlist_id, e) + return [] + + +def _extract_yt_initial_data(html: str) -> dict | None: + """Extract the ytInitialData JSON object embedded in a YouTube page.""" + patterns = [ + re.compile(r"var\s+ytInitialData\s*=\s*"), + re.compile(r'window\["ytInitialData"\]\s*=\s*'), + ] + + start = -1 + for pattern in patterns: + match = pattern.search(html) + if match: + start = match.end() + break + + if start == -1: + return None + + depth = 0 + i = start + while i < len(html): + ch = html[i] + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + break + elif ch == '"': + i += 1 + while i < len(html) and html[i] != '"': + if html[i] == "\\": + i += 1 + i += 1 + i += 1 + + try: + return json.loads(html[start : i + 1]) + except (json.JSONDecodeError, IndexError): + return None + + +def _extract_playlist_video_ids(data: dict) -> list[str]: + """Walk the data tree and collect videoIds from playlistVideoRenderer nodes.""" + video_ids: list[str] = [] + seen: set[str] = set() + 
+ def _walk(obj: object) -> None: + if isinstance(obj, dict): + if "playlistVideoRenderer" in obj: + vid = obj["playlistVideoRenderer"].get("videoId") + if vid and vid not in seen: + seen.add(vid) + video_ids.append(vid) + else: + for v in obj.values(): + _walk(v) + elif isinstance(obj, list): + for item in obj: + _walk(item) + + _walk(data) + return video_ids diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx index 7e246f847..c8e565df5 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx @@ -2,9 +2,9 @@ import { TagInput, type Tag as TagType } from "emblor"; import { useAtom } from "jotai"; -import { ArrowLeft } from "lucide-react"; +import { ArrowLeft, Info } from "lucide-react"; import { useTranslations } from "next-intl"; -import { type FC, useState } from "react"; +import { type FC, useCallback, useState } from "react"; import { toast } from "sonner"; import { createDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms"; import { Button } from "@/components/ui/button"; @@ -12,9 +12,29 @@ import { Label } from "@/components/ui/label"; import { Spinner } from "@/components/ui/spinner"; import { EnumConnectorName } from "@/contracts/enums/connector"; import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; +import { baseApiService } from "@/lib/apis/base-api.service"; -const youtubeRegex = - /^(https:\/\/)?(www\.)?(youtube\.com\/watch\?v=|youtu\.be\/)([a-zA-Z0-9_-]{11})$/; +const YOUTUBE_VIDEO_URL_RE = + /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/watch\?[^\s]*v=[\w-]{11}|youtu\.be\/[\w-]{11})[^\s]*/; + +const YOUTUBE_PLAYLIST_URL_RE = + /(?:https?:\/\/)?(?:www\.)?youtube\.com\/[^\s]*[?&]list=[\w-]+[^\s]*/; + +const YOUTUBE_ANY_URL_RE = + 
/(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:watch[^\s]*|playlist[^\s]*)|youtu\.be\/[\w-]+[^\s]*)/gi; + +function isYoutubeVideoUrl(url: string): boolean { + return YOUTUBE_VIDEO_URL_RE.test(url.trim()); +} + +function isYoutubePlaylistUrl(url: string): boolean { + return YOUTUBE_PLAYLIST_URL_RE.test(url.trim()); +} + +function extractYoutubeUrls(text: string): string[] { + const matches = text.match(YOUTUBE_ANY_URL_RE); + return matches ? [...new Set(matches)] : []; +} interface YouTubeCrawlerViewProps { searchSpaceId: string; @@ -26,27 +46,107 @@ export const YouTubeCrawlerView: FC = ({ searchSpaceId, const [videoTags, setVideoTags] = useState([]); const [activeTagIndex, setActiveTagIndex] = useState(null); const [error, setError] = useState(null); + const [isFetchingPlaylist, setIsFetchingPlaylist] = useState(false); - // Use the createDocumentMutationAtom const [createDocumentMutation] = useAtom(createDocumentMutationAtom); const { mutate: createYouTubeDocument, isPending: isSubmitting } = createDocumentMutation; - const isValidYoutubeUrl = (url: string): boolean => { - return youtubeRegex.test(url); - }; - const extractVideoId = (url: string): string | null => { - const match = url.match(/(?:youtube\.com\/watch\?v=|youtu\.be\/)([a-zA-Z0-9_-]{11})/); + const match = url.match(/(?:[?&]v=|youtu\.be\/)([\w-]{11})/); return match ? match[1] : null; }; + const resolvePlaylist = useCallback( + async (url: string) => { + setIsFetchingPlaylist(true); + toast(t("resolving_playlist_toast"), { + description: t("resolving_playlist_toast_desc"), + }); + + try { + const response = (await baseApiService.get( + `/api/v1/youtube/playlist-videos?url=${encodeURIComponent(url)}` + )) as { video_urls: string[]; count: number }; + + const resolvedUrls: string[] = response.video_urls ?? 
[]; + + setVideoTags((prev) => { + const existingTexts = new Set(prev.map((tag) => tag.text)); + const newTags = resolvedUrls + .filter((vUrl) => !existingTexts.has(vUrl)) + .map((vUrl) => ({ + id: `${Date.now()}-${Math.random()}`, + text: vUrl, + })); + return newTags.length > 0 ? [...prev, ...newTags] : prev; + }); + + toast(t("playlist_resolved_toast"), { + description: t("playlist_resolved_toast_desc", { count: resolvedUrls.length }), + }); + } catch (err) { + const message = err instanceof Error ? err.message : t("error_generic"); + toast(t("playlist_error_toast"), { description: message }); + } finally { + setIsFetchingPlaylist(false); + } + }, + [t] + ); + + const handlePaste = useCallback( + async (e: React.ClipboardEvent) => { + const text = e.clipboardData.getData("text/plain"); + if (!text) return; + + const urls = extractYoutubeUrls(text); + if (urls.length === 0) return; + + e.preventDefault(); + + const playlistUrls: string[] = []; + const videoUrls: string[] = []; + + for (const url of urls) { + if (isYoutubePlaylistUrl(url)) { + playlistUrls.push(url); + } else if (isYoutubeVideoUrl(url)) { + videoUrls.push(url); + } + } + + if (videoUrls.length > 0) { + setVideoTags((prev) => { + const existingTexts = new Set(prev.map((tag) => tag.text)); + const newTags = videoUrls + .filter((url) => !existingTexts.has(url.trim())) + .map((url) => ({ + id: `${Date.now()}-${Math.random()}`, + text: url.trim(), + })); + if (newTags.length === 0) { + toast(t("duplicate_url_toast"), { + description: t("duplicate_url_toast_desc"), + }); + } + return newTags.length > 0 ? 
[...prev, ...newTags] : prev; + }); + } + + for (const url of playlistUrls) { + await resolvePlaylist(url); + } + }, + [resolvePlaylist, t] + ); + const handleSubmit = async () => { if (videoTags.length === 0) { setError(t("error_no_video")); return; } - const invalidUrls = videoTags.filter((tag) => !isValidYoutubeUrl(tag.text)); + const invalidUrls = videoTags.filter((tag) => !isYoutubeVideoUrl(tag.text)); if (invalidUrls.length > 0) { setError(t("error_invalid_urls", { urls: invalidUrls.map((tag) => tag.text).join(", ") })); return; @@ -60,7 +160,6 @@ export const YouTubeCrawlerView: FC = ({ searchSpaceId, const videoUrls = videoTags.map((tag) => tag.text); - // Use the mutation to create YouTube documents createYouTubeDocument( { document_type: "YOUTUBE_VIDEO", @@ -86,7 +185,12 @@ export const YouTubeCrawlerView: FC = ({ searchSpaceId, }; const handleAddTag = (text: string) => { - if (!isValidYoutubeUrl(text)) { + if (isYoutubePlaylistUrl(text)) { + resolvePlaylist(text); + return; + } + + if (!isYoutubeVideoUrl(text)) { toast(t("invalid_url_toast"), { description: t("invalid_url_toast_desc"), }); @@ -111,7 +215,7 @@ export const YouTubeCrawlerView: FC = ({ searchSpaceId, return (
{/* Header */} -
+
+ {isFetchingPlaylist && ( +
+ + {t("resolving_playlist")} +
+ )} + {error &&
{error}
} +
+ +

+ {t("chat_tip")} +

+
+

{t("tips_title")}

    @@ -171,14 +292,15 @@ export const YouTubeCrawlerView: FC = ({ searchSpaceId,
  • {t("tip_2")}
  • {t("tip_3")}
  • {t("tip_4")}
  • +
  • {t("tip_5")}
- {videoTags.length > 0 && ( + {videoTags.length > 0 && videoTags.length <= 3 && (

{t("preview")}:

- {videoTags.map((tag, _index) => { + {videoTags.map((tag) => { const videoId = extractVideoId(tag.text); return videoId ? (
= ({ searchSpaceId,
{/* Fixed Footer - Action buttons */} -
+