# Mirror of https://github.com/MODSetter/SurfSense.git
# Synced 2026-04-25 00:36:31 +02:00 · 206 lines · 6 KiB · Python
"""YouTube utility routes (playlist resolution)."""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
|
|
import aiohttp
|
|
from fake_useragent import UserAgent
|
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
|
|
|
from app.db import User
|
|
from app.users import current_active_user
|
|
from app.utils.proxy_config import get_requests_proxies
|
|
|
|
# Router that the YouTube utility endpoints in this module are registered on.
router = APIRouter()
logger = logging.getLogger(__name__)

# Captures the value of the ``list`` query parameter (the playlist ID) of a URL.
_PLAYLIST_ID_RE = re.compile(r"[?&]list=([\w-]+)")

# Endpoint that the innertube playlist lookup POSTs its browse request to.
_INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/browse"
# Client descriptor sent as ``context.client`` in the innertube request payload.
# NOTE(review): ``clientVersion`` is a pinned literal; presumably it has to be
# refreshed from time to time to keep being accepted — confirm.
_INNERTUBE_CLIENT = {
    "clientName": "WEB",
    "clientVersion": "2.20240313.05.00",
    "hl": "en",
    "gl": "US",
}
|
|
|
|
|
|
@router.get("/youtube/playlist-videos")
async def get_playlist_videos(
    url: str = Query(..., description="YouTube playlist URL"),
    _user: User = Depends(current_active_user),
):
    """Expand a YouTube playlist URL into the watch URLs of its videos.

    The playlist ID is read from the ``list`` query parameter of ``url``. The
    innertube lookup is attempted first; the HTML scraper is consulted only
    when that lookup yields nothing.

    Returns:
        ``{"video_urls": [...], "count": <number of URLs>}``.

    Raises:
        HTTPException: 400 when ``url`` has no ``list`` parameter, 404 when
            neither lookup finds a video, 500 on any other failure.
    """
    id_match = _PLAYLIST_ID_RE.search(url)
    if id_match is None:
        raise HTTPException(status_code=400, detail="Invalid YouTube playlist URL")
    playlist_id = id_match.group(1)

    try:
        found_ids = await _fetch_playlist_via_innertube(playlist_id)
        # ``or`` short-circuits: the fallback runs only on an empty result.
        found_ids = found_ids or await _fetch_playlist_via_html(playlist_id)

        if found_ids:
            video_urls = [
                f"https://www.youtube.com/watch?v={vid}" for vid in found_ids
            ]
            return {"video_urls": video_urls, "count": len(video_urls)}

        raise HTTPException(
            status_code=404,
            detail="No videos found in the playlist. It may be private or empty.",
        )
    except HTTPException:
        # Deliberate HTTP errors (the 404 above) must reach the client as-is.
        raise
    except Exception as e:
        logger.error("Error resolving playlist %s: %s", url, e)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to resolve playlist: {e!s}",
        ) from e
|
|
|
|
|
|
async def _fetch_playlist_via_innertube(playlist_id: str) -> list[str]:
    """Look up a playlist's video IDs through the innertube browse endpoint.

    POSTs a browse request for ``VL<playlist_id>`` without sending any
    cookies. This is a best-effort lookup: a non-200 response or any exception
    is logged as a warning and reported as an empty list.

    Args:
        playlist_id: Playlist ID without the ``VL`` prefix.

    Returns:
        Video IDs extracted from the response, or ``[]`` on failure.
    """
    request_body = {
        "context": {"client": _INNERTUBE_CLIENT},
        "browseId": f"VL{playlist_id}",
    }
    proxies = get_requests_proxies()

    try:
        # Resolved inside the ``try`` so a malformed proxy mapping degrades to
        # an empty result like every other failure here.
        proxy_url = proxies["http"] if proxies else None

        async with aiohttp.ClientSession() as session:
            async with session.post(
                _INNERTUBE_API_URL,
                json=request_body,
                headers={"Content-Type": "application/json"},
                proxy=proxy_url,
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                else:
                    logger.warning(
                        "Innertube API returned %d for playlist %s",
                        resp.status,
                        playlist_id,
                    )
                    return []

        return _extract_playlist_video_ids(data)
    except Exception as exc:
        logger.warning("Innertube API failed for playlist %s: %s", playlist_id, exc)
        return []
|
|
|
|
|
|
async def _fetch_playlist_via_html(playlist_id: str) -> list[str]:
    """Scrape the playlist page and pull video IDs out of its embedded data.

    The page is requested with a random User-Agent and with the ``CONSENT`` /
    ``SOCS`` cookies already set. This is a best-effort fallback: a non-200
    response, a page without ``ytInitialData`` or any exception is logged as a
    warning and reported as an empty list.

    Args:
        playlist_id: ID of the playlist whose page is fetched.

    Returns:
        Video IDs found in the page data, or ``[]`` on failure.
    """
    request_headers = {
        "User-Agent": UserAgent().random,
        "Accept-Language": "en-US,en;q=0.9",
    }
    consent_cookies = {
        "CONSENT": "PENDING+999",
        "SOCS": "CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjMwODI5LjA3X3AxGgJlbiADGgYIgOa_pgY",
    }
    proxies = get_requests_proxies()
    playlist_url = f"https://www.youtube.com/playlist?list={playlist_id}"

    try:
        # Resolved inside the ``try`` so a malformed proxy mapping degrades to
        # an empty result like every other failure here.
        proxy_url = proxies["http"] if proxies else None

        async with aiohttp.ClientSession(cookies=consent_cookies) as session:
            async with session.get(
                playlist_url,
                headers=request_headers,
                proxy=proxy_url,
            ) as resp:
                if resp.status == 200:
                    page = await resp.text()
                else:
                    logger.warning(
                        "HTML fallback returned %d for playlist %s",
                        resp.status,
                        playlist_id,
                    )
                    return []

        initial_data = _extract_yt_initial_data(page)
        if initial_data:
            return _extract_playlist_video_ids(initial_data)

        logger.warning(
            "Could not find ytInitialData in HTML for playlist %s",
            playlist_id,
        )
        return []
    except Exception as exc:
        logger.warning("HTML fallback failed for playlist %s: %s", playlist_id, exc)
        return []
|
|
|
|
|
|
def _extract_yt_initial_data(html: str) -> dict | None:
|
|
"""Extract the ytInitialData JSON object embedded in a YouTube page."""
|
|
patterns = [
|
|
re.compile(r"var\s+ytInitialData\s*=\s*"),
|
|
re.compile(r'window\["ytInitialData"\]\s*=\s*'),
|
|
]
|
|
|
|
start = -1
|
|
for pattern in patterns:
|
|
match = pattern.search(html)
|
|
if match:
|
|
start = match.end()
|
|
break
|
|
|
|
if start == -1:
|
|
return None
|
|
|
|
depth = 0
|
|
i = start
|
|
while i < len(html):
|
|
ch = html[i]
|
|
if ch == "{":
|
|
depth += 1
|
|
elif ch == "}":
|
|
depth -= 1
|
|
if depth == 0:
|
|
break
|
|
elif ch == '"':
|
|
i += 1
|
|
while i < len(html) and html[i] != '"':
|
|
if html[i] == "\\":
|
|
i += 1
|
|
i += 1
|
|
i += 1
|
|
|
|
try:
|
|
return json.loads(html[start : i + 1])
|
|
except (json.JSONDecodeError, IndexError):
|
|
return None
|
|
|
|
|
|
def _extract_playlist_video_ids(data: dict) -> list[str]:
    """Collect the ``videoId`` of every ``playlistVideoRenderer`` in ``data``.

    Dicts and lists are traversed depth-first in their stored order. A dict
    holding a ``playlistVideoRenderer`` key contributes that renderer's
    ``videoId`` and is not searched any further. Missing or empty IDs are
    skipped and repeated IDs are reported once, at their first position.

    Args:
        data: Decoded JSON tree to search.

    Returns:
        Unique video IDs in order of first appearance.
    """

    def _renderer_ids(node: object):
        if isinstance(node, dict):
            if "playlistVideoRenderer" in node:
                yield node["playlistVideoRenderer"].get("videoId")
            else:
                for child in node.values():
                    yield from _renderer_ids(child)
        elif isinstance(node, list):
            for child in node:
                yield from _renderer_ids(child)

    # dict.fromkeys keeps first-seen order while discarding duplicates.
    unique_ids = dict.fromkeys(vid for vid in _renderer_ids(data) if vid)
    return list(unique_ids)
|