feat: add YouTube video and playlist support in document collection with enhanced URL handling

2026-06-08 20:25:19 +02:00 · 2026-03-09 16:07:54 -07:00 · 2026-03-09 16:07:54 -07:00 · c6fc4edbc2
commit c6fc4edbc2
parent e481415655
10 changed files with 445 additions and 100 deletions
--- a/surfsense_backend/app/routes/init.py
+++ b/surfsense_backend/app/routes/init.py
@ -42,6 +42,7 @@ from .search_spaces_routes import router as search_spaces_router
 from .slack_add_connector_route import router as slack_add_connector_router
 from .surfsense_docs_routes import router as surfsense_docs_router
 from .teams_add_connector_route import router as teams_add_connector_router
+from .youtube_routes import router as youtube_router

 router = APIRouter()

@ -79,3 +80,4 @@ router.include_router(notifications_router)  # Notifications with Electric SQL s
 router.include_router(composio_router)  # Composio OAuth and toolkit management
 router.include_router(public_chat_router)  # Public chat sharing and cloning
 router.include_router(incentive_tasks_router)  # Incentive tasks for earning free pages
+router.include_router(youtube_router)  # YouTube playlist resolution
--- a/surfsense_backend/app/routes/youtube_routes.py
+++ b/surfsense_backend/app/routes/youtube_routes.py
@ -0,0 +1,205 @@
+"""YouTube utility routes (playlist resolution)."""
+
+import json
+import logging
+import re
+
+import aiohttp
+from fake_useragent import UserAgent
+from fastapi import APIRouter, Depends, HTTPException, Query
+
+from app.db import User
+from app.users import current_active_user
+from app.utils.proxy_config import get_requests_proxies
+
+# Router that the playlist-resolution endpoint in this module is attached to.
+router = APIRouter()
+logger = logging.getLogger(__name__)
+
+# Captures the value of a ``list=`` query parameter (word characters and
+# hyphens only); used to pull the playlist ID out of the submitted URL.
+_PLAYLIST_ID_RE = re.compile(r"[?&]list=([\w-]+)")
+
+# Target URL and ``context.client`` payload for the innertube browse request.
+# NOTE(review): clientVersion is a pinned literal -- presumably it has to be
+# refreshed if YouTube stops accepting this version; confirm.
+_INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/browse"
+_INNERTUBE_CLIENT = {
+    "clientName": "WEB",
+    "clientVersion": "2.20240313.05.00",
+    "hl": "en",
+    "gl": "US",
+}
+
+
+@router.get("/youtube/playlist-videos")
+async def get_playlist_videos(
+    url: str = Query(..., description="YouTube playlist URL"),
+    _user: User = Depends(current_active_user),
+):
+    """Resolve a YouTube playlist URL into individual video URLs.
+
+    Args:
+        url: Any URL carrying a ``list=<playlist id>`` query parameter.
+        _user: Injected by ``current_active_user``; not used in the body.
+
+    Returns:
+        A dict with ``video_urls`` (watch URLs in playlist order, duplicates
+        removed) and ``count`` (the number of URLs).
+
+    Raises:
+        HTTPException: 400 when no playlist ID can be found in ``url``,
+            404 when neither lookup strategy yields any video, 500 when a
+            lookup raises unexpectedly.
+    """
+    match = _PLAYLIST_ID_RE.search(url)
+    if not match:
+        raise HTTPException(status_code=400, detail="Invalid YouTube playlist URL")
+
+    playlist_id = match.group(1)
+
+    # Only the two fetch calls can fail unexpectedly, so only they are guarded.
+    try:
+        video_ids = await _fetch_playlist_via_innertube(playlist_id)
+
+        # The HTML scrape is attempted only when innertube produced nothing.
+        if not video_ids:
+            video_ids = await _fetch_playlist_via_html(playlist_id)
+    except Exception as e:
+        # logger.exception records the traceback along with the message.
+        logger.exception("Error resolving playlist %s: %s", url, e)
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to resolve playlist: {e!s}",
+        ) from e
+
+    if not video_ids:
+        raise HTTPException(
+            status_code=404,
+            detail="No videos found in the playlist. It may be private or empty.",
+        )
+
+    video_urls = [f"https://www.youtube.com/watch?v={vid}" for vid in video_ids]
+    return {"video_urls": video_urls, "count": len(video_urls)}
+
+
+async def _fetch_playlist_via_innertube(playlist_id: str) -> list[str]:
+    """Query the innertube browse endpoint for a playlist's video IDs.
+
+    The playlist is requested as browse ID ``VL<playlist_id>`` with the
+    module's client context; no cookies are sent.
+
+    Args:
+        playlist_id: Playlist identifier taken from the ``list=`` parameter.
+
+    Returns:
+        Video IDs found in the response, or an empty list when the endpoint
+        answers with a non-200 status or any error occurs (failures are
+        logged as warnings and never propagated).
+    """
+    request_body = {
+        "context": {"client": _INNERTUBE_CLIENT},
+        "browseId": f"VL{playlist_id}",
+    }
+    proxy_settings = get_requests_proxies()
+
+    try:
+        async with aiohttp.ClientSession() as http:
+            async with http.post(
+                _INNERTUBE_API_URL,
+                json=request_body,
+                headers={"Content-Type": "application/json"},
+                proxy=proxy_settings["http"] if proxy_settings else None,
+            ) as resp:
+                if resp.status != 200:
+                    logger.warning(
+                        "Innertube API returned %d for playlist %s",
+                        resp.status,
+                        playlist_id,
+                    )
+                    return []
+                browse_data = await resp.json()
+
+        return _extract_playlist_video_ids(browse_data)
+    except Exception as exc:
+        # Best-effort lookup: the caller falls back to HTML scraping on [].
+        logger.warning("Innertube API failed for playlist %s: %s", playlist_id, exc)
+        return []
+
+
+async def _fetch_playlist_via_html(playlist_id: str) -> list[str]:
+    """Scrape the playlist page and read video IDs from ``ytInitialData``.
+
+    The page is requested with a random User-Agent and fixed ``CONSENT`` /
+    ``SOCS`` cookies.
+
+    Args:
+        playlist_id: Playlist identifier taken from the ``list=`` parameter.
+
+    Returns:
+        Video IDs found in the embedded page data, or an empty list when the
+        page answers with a non-200 status, the data blob cannot be located,
+        or any error occurs (failures are logged as warnings).
+    """
+    request_headers = {
+        "User-Agent": UserAgent().random,
+        "Accept-Language": "en-US,en;q=0.9",
+    }
+    consent_cookies = {
+        "CONSENT": "PENDING+999",
+        "SOCS": "CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjMwODI5LjA3X3AxGgJlbiADGgYIgOa_pgY",
+    }
+    proxy_settings = get_requests_proxies()
+    page_url = f"https://www.youtube.com/playlist?list={playlist_id}"
+
+    try:
+        async with aiohttp.ClientSession(cookies=consent_cookies) as http:
+            async with http.get(
+                page_url,
+                headers=request_headers,
+                proxy=proxy_settings["http"] if proxy_settings else None,
+            ) as resp:
+                if resp.status != 200:
+                    logger.warning(
+                        "HTML fallback returned %d for playlist %s",
+                        resp.status,
+                        playlist_id,
+                    )
+                    return []
+                page_html = await resp.text()
+
+        initial_data = _extract_yt_initial_data(page_html)
+        if initial_data:
+            return _extract_playlist_video_ids(initial_data)
+
+        logger.warning(
+            "Could not find ytInitialData in HTML for playlist %s",
+            playlist_id,
+        )
+        return []
+    except Exception as exc:
+        # Best-effort lookup: any failure is reported as "no videos".
+        logger.warning("HTML fallback failed for playlist %s: %s", playlist_id, exc)
+        return []
+
+
+def _extract_yt_initial_data(html: str) -> dict | None:
+    """Extract the ytInitialData JSON object embedded in a YouTube page.
+
+    Looks for ``var ytInitialData = `` first and ``window["ytInitialData"] = ``
+    second, then decodes the JSON value that starts right after the
+    assignment. Whatever follows the value (``;</script>`` etc.) is ignored.
+
+    Args:
+        html: Full page markup.
+
+    Returns:
+        The decoded object, or ``None`` when no assignment is present or the
+        text after it is not a valid JSON object.
+    """
+    assignment_patterns = (
+        r"var\s+ytInitialData\s*=\s*",
+        r'window\["ytInitialData"\]\s*=\s*',
+    )
+
+    for pattern in assignment_patterns:
+        match = re.search(pattern, html)
+        if match:
+            break
+    else:
+        return None
+
+    # raw_decode parses exactly one JSON value starting at the given index and
+    # tolerates trailing text, so no manual brace/string scanning is needed.
+    try:
+        data, _end = json.JSONDecoder().raw_decode(html, match.end())
+    except json.JSONDecodeError:
+        return None
+
+    # Only an object is a usable data tree; arrays or scalars are rejected.
+    return data if isinstance(data, dict) else None
+
+
+def _extract_playlist_video_ids(data: dict) -> list[str]:
+    """Collect videoIds from every playlistVideoRenderer node in the tree.
+
+    The tree is traversed depth-first in document order. A dict that holds a
+    ``playlistVideoRenderer`` key is treated as a leaf: its ``videoId`` is
+    recorded and nothing below it is visited.
+
+    Args:
+        data: Decoded JSON tree (nested dicts and lists).
+
+    Returns:
+        Unique video IDs in first-seen order. Renderers that are not dicts or
+        lack a non-empty string ``videoId`` are skipped.
+    """
+    video_ids: list[str] = []
+    seen: set[str] = set()
+
+    # Explicit stack instead of recursion so deep trees cannot overflow the
+    # interpreter stack; children are pushed reversed to keep document order.
+    pending: list[object] = [data]
+    while pending:
+        node = pending.pop()
+        if isinstance(node, dict):
+            if "playlistVideoRenderer" in node:
+                renderer = node["playlistVideoRenderer"]
+                vid = renderer.get("videoId") if isinstance(renderer, dict) else None
+                if isinstance(vid, str) and vid and vid not in seen:
+                    seen.add(vid)
+                    video_ids.append(vid)
+            else:
+                pending.extend(reversed(node.values()))
+        elif isinstance(node, list):
+            pending.extend(reversed(node))
+
+    return video_ids