mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-08 20:25:19 +02:00
feat: add YouTube video and playlist support in document collection with enhanced URL handling
This commit is contained in:
parent
e481415655
commit
c6fc4edbc2
10 changed files with 445 additions and 100 deletions
|
|
@ -42,6 +42,7 @@ from .search_spaces_routes import router as search_spaces_router
|
|||
from .slack_add_connector_route import router as slack_add_connector_router
|
||||
from .surfsense_docs_routes import router as surfsense_docs_router
|
||||
from .teams_add_connector_route import router as teams_add_connector_router
|
||||
from .youtube_routes import router as youtube_router
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
|
@ -79,3 +80,4 @@ router.include_router(notifications_router) # Notifications with Electric SQL s
|
|||
router.include_router(composio_router) # Composio OAuth and toolkit management
|
||||
router.include_router(public_chat_router) # Public chat sharing and cloning
|
||||
router.include_router(incentive_tasks_router) # Incentive tasks for earning free pages
|
||||
router.include_router(youtube_router) # YouTube playlist resolution
|
||||
|
|
|
|||
205
surfsense_backend/app/routes/youtube_routes.py
Normal file
205
surfsense_backend/app/routes/youtube_routes.py
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
"""YouTube utility routes (playlist resolution)."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
|
||||
import aiohttp
|
||||
from fake_useragent import UserAgent
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from app.db import User
|
||||
from app.users import current_active_user
|
||||
from app.utils.proxy_config import get_requests_proxies
|
||||
|
||||
# Router that collects the YouTube utility endpoints defined in this module.
router = APIRouter()

# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Captures the value of the ``list`` query parameter (the playlist ID) from a
# URL; the parameter may be the first (``?list=``) or a later (``&list=``) one.
_PLAYLIST_ID_RE = re.compile(r"[?&]list=([\w-]+)")

# Innertube "browse" endpoint used to request playlist contents as JSON.
_INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/browse"

# Client context sent in the body of every innertube request.
_INNERTUBE_CLIENT = {
    "clientName": "WEB",
    "clientVersion": "2.20240313.05.00",
    "hl": "en",
    "gl": "US",
}
|
||||
|
||||
|
||||
@router.get("/youtube/playlist-videos")
async def get_playlist_videos(
    url: str = Query(..., description="YouTube playlist URL"),
    _user: User = Depends(current_active_user),
):
    """Resolve a YouTube playlist URL into individual video URLs.

    The playlist ID is read from the ``list`` query parameter of ``url``. The
    innertube API is tried first; if it yields no videos, the playlist page
    HTML is scraped as a fallback.

    Args:
        url: Playlist URL; it must contain a ``list=<id>`` query parameter.
        _user: Resolved via ``current_active_user``; not used in the body
            (presumably present only to require an authenticated caller).

    Returns:
        A dict with ``video_urls`` (one watch URL per resolved video ID, in
        the order the fetch helpers returned them) and ``count``.

    Raises:
        HTTPException: 400 if ``url`` has no playlist ID, 404 if neither
            strategy found any video, 500 on any other unexpected failure.
    """
    match = _PLAYLIST_ID_RE.search(url)
    if not match:
        raise HTTPException(status_code=400, detail="Invalid YouTube playlist URL")

    playlist_id = match.group(1)

    try:
        video_ids = await _fetch_playlist_via_innertube(playlist_id)

        if not video_ids:
            video_ids = await _fetch_playlist_via_html(playlist_id)

        if not video_ids:
            raise HTTPException(
                status_code=404,
                detail="No videos found in the playlist. It may be private or empty.",
            )

        video_urls = [
            f"https://www.youtube.com/watch?v={vid}" for vid in video_ids
        ]
        return {"video_urls": video_urls, "count": len(video_urls)}

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 404 above) pass through untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Error resolving playlist %s: %s", url, e)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to resolve playlist: {e!s}",
        ) from e
|
||||
|
||||
|
||||
async def _fetch_playlist_via_innertube(playlist_id: str) -> list[str]:
    """Fetch playlist videos using YouTube's innertube API (no cookies needed).

    A single browse request is sent for ``VL<playlist_id>``; no follow-up
    (continuation) requests are made, so only the video IDs present in that
    one response are returned.

    Args:
        playlist_id: The playlist ID (value of the ``list`` URL parameter).

    Returns:
        Video IDs found in the response, or an empty list on a non-200
        status, a timeout, or any other error (failures are logged, never
        raised, so the caller can fall back to another strategy).
    """
    payload = {
        "context": {"client": _INNERTUBE_CLIENT},
        "browseId": f"VL{playlist_id}",
    }
    proxies = get_requests_proxies()
    # aiohttp's default total timeout is 5 minutes; bound the request so a
    # stalled upstream cannot hold the caller (and delay the fallback) that long.
    timeout = aiohttp.ClientTimeout(total=30)

    try:
        async with (
            aiohttp.ClientSession(timeout=timeout) as session,
            session.post(
                _INNERTUBE_API_URL,
                json=payload,
                headers={"Content-Type": "application/json"},
                proxy=proxies["http"] if proxies else None,
            ) as response,
        ):
            if response.status != 200:
                logger.warning(
                    "Innertube API returned %d for playlist %s",
                    response.status,
                    playlist_id,
                )
                return []
            data = await response.json()

        return _extract_playlist_video_ids(data)
    except Exception as e:
        # repr() keeps the exception type visible; a timeout has an empty str().
        logger.warning(
            "Innertube API failed for playlist %s: %s", playlist_id, repr(e)
        )
        return []
|
||||
|
||||
|
||||
async def _fetch_playlist_via_html(playlist_id: str) -> list[str]:
    """Fallback: scrape playlist page HTML with consent cookies set.

    The playlist page is requested with a random User-Agent and fixed
    ``CONSENT``/``SOCS`` cookies (presumably to get past the cookie-consent
    interstitial), then the embedded ``ytInitialData`` object is parsed.

    Args:
        playlist_id: The playlist ID (value of the ``list`` URL parameter).

    Returns:
        Video IDs found in the page data, or an empty list on a non-200
        status, missing ``ytInitialData``, a timeout, or any other error
        (failures are logged, never raised).
    """
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random,
        "Accept-Language": "en-US,en;q=0.9",
    }
    cookies = {
        "CONSENT": "PENDING+999",
        "SOCS": "CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjMwODI5LjA3X3AxGgJlbiADGgYIgOa_pgY",
    }
    proxies = get_requests_proxies()
    playlist_url = f"https://www.youtube.com/playlist?list={playlist_id}"
    # aiohttp's default total timeout is 5 minutes; bound the request so a
    # stalled upstream cannot hold the caller that long.
    timeout = aiohttp.ClientTimeout(total=30)

    try:
        async with (
            aiohttp.ClientSession(cookies=cookies, timeout=timeout) as session,
            session.get(
                playlist_url,
                headers=headers,
                proxy=proxies["http"] if proxies else None,
            ) as response,
        ):
            if response.status != 200:
                logger.warning(
                    "HTML fallback returned %d for playlist %s",
                    response.status,
                    playlist_id,
                )
                return []
            html = await response.text()

        yt_data = _extract_yt_initial_data(html)
        if not yt_data:
            logger.warning(
                "Could not find ytInitialData in HTML for playlist %s",
                playlist_id,
            )
            return []

        return _extract_playlist_video_ids(yt_data)
    except Exception as e:
        # repr() keeps the exception type visible; a timeout has an empty str().
        logger.warning(
            "HTML fallback failed for playlist %s: %s", playlist_id, repr(e)
        )
        return []
|
||||
|
||||
|
||||
def _extract_yt_initial_data(html: str) -> dict | None:
|
||||
"""Extract the ytInitialData JSON object embedded in a YouTube page."""
|
||||
patterns = [
|
||||
re.compile(r"var\s+ytInitialData\s*=\s*"),
|
||||
re.compile(r'window\["ytInitialData"\]\s*=\s*'),
|
||||
]
|
||||
|
||||
start = -1
|
||||
for pattern in patterns:
|
||||
match = pattern.search(html)
|
||||
if match:
|
||||
start = match.end()
|
||||
break
|
||||
|
||||
if start == -1:
|
||||
return None
|
||||
|
||||
depth = 0
|
||||
i = start
|
||||
while i < len(html):
|
||||
ch = html[i]
|
||||
if ch == "{":
|
||||
depth += 1
|
||||
elif ch == "}":
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
break
|
||||
elif ch == '"':
|
||||
i += 1
|
||||
while i < len(html) and html[i] != '"':
|
||||
if html[i] == "\\":
|
||||
i += 1
|
||||
i += 1
|
||||
i += 1
|
||||
|
||||
try:
|
||||
return json.loads(html[start : i + 1])
|
||||
except (json.JSONDecodeError, IndexError):
|
||||
return None
|
||||
|
||||
|
||||
def _extract_playlist_video_ids(data: dict) -> list[str]:
    """Collect ``videoId`` values from every ``playlistVideoRenderer`` node.

    The structure is traversed depth-first through dicts and lists. A dict
    that has a ``playlistVideoRenderer`` key contributes that renderer's
    ``videoId`` and is not descended into any further.

    Args:
        data: Decoded JSON tree (innertube response or ``ytInitialData``).

    Returns:
        Non-empty video IDs in first-seen order, without duplicates.
    """

    def _iter_video_ids(node: object):
        # Yield the raw videoId (possibly None) of each renderer under node.
        if isinstance(node, dict):
            if "playlistVideoRenderer" not in node:
                for child in node.values():
                    yield from _iter_video_ids(child)
                return
            yield node["playlistVideoRenderer"].get("videoId")
        elif isinstance(node, list):
            for element in node:
                yield from _iter_video_ids(element)

    # dict.fromkeys drops repeats while keeping the first-seen order.
    return list(dict.fromkeys(vid for vid in _iter_video_ids(data) if vid))
|
||||
Loading…
Add table
Add a link
Reference in a new issue