fix(ytdl): instant cache hits via URL id regex + force playlist expansion

Address PR #27 review:

- download() now parses the video id straight from the URL and returns a cached file without calling _probe_remote(), so cached reruns no longer hit YouTube on every startup. Unrecognized URLs fall through to the probe.
- expand_playlist() passes --yes-playlist so a watch?v=...&list=... URL expands the full list instead of queuing a single video.
2026-06-23 22:48:06 +02:00 · 2026-06-20 13:51:54 -04:00 · 2026-06-20 13:51:54 -04:00 · 6047f8ed22
commit 6047f8ed22
parent bfbf94637b
1 changed file with 38 additions and 3 deletions
--- a/ytdl.py
+++ b/ytdl.py
@@ -22,6 +22,7 @@ Robustness contract (why this file is more than a one-liner):
  is left untouched: it only ever sees clean, canonical mp4s.
 """
 import os
+import re
 import sys
 import json
 import shutil
@@ -30,6 +31,17 @@ import subprocess

 _URL_HINTS = ("http://", "https://", "youtube.com", "youtu.be")

+# YouTube video ids are exactly 11 chars of [A-Za-z0-9_-]. Match the id wherever
+# it appears across the common URL shapes so a cached file can be found without
+# touching the network. URLs matching none of these shapes (most other yt-dlp
+# sources, odd URLs) resolve to None and fall through to the normal probe+download path.
+_YT_ID = r"([A-Za-z0-9_-]{11})"
+_YT_ID_PATTERNS = (
+    re.compile(r"[?&]v=" + _YT_ID),            # watch?v=ID  (also with &list=...)
+    re.compile(r"youtu\.be/" + _YT_ID),        # youtu.be/ID
+    re.compile(r"/(?:shorts|embed|live|v)/" + _YT_ID),  # shorts/embed/live/v
+)
+
 # What the engine can open without surprises.
 _OK_VCODEC = "h264"
 _OK_ACODEC = "aac"
@@ -61,11 +73,14 @@ def expand_playlist(url: str) -> list[str]:
    Expand a playlist/channel URL into a list of individual video URLs.

    Uses `--flat-playlist` so it only reads the index (no per-video download).
+    `--yes-playlist` forces expansion of a `watch?v=...&list=...` URL, which
+    yt-dlp otherwise treats as a single video (queuing only 1 entry).
    Returns ``[url]`` unchanged for a single video, or if expansion fails for
    any reason, so the caller can still attempt a normal single download.
    """
    _require_ytdlp()
-    res = _ytdlp("--flat-playlist", "-J", url, timeout=_PROBE_TIMEOUT)
+    res = _ytdlp("--yes-playlist", "--flat-playlist", "-J", url,
+                 timeout=_PROBE_TIMEOUT)
    if res.returncode != 0 or not res.stdout.strip():
        return [url]
    try:
@@ -93,6 +108,15 @@ def _ytdlp(*args: str, timeout: int = _DL_TIMEOUT) -> subprocess.CompletedProces
        raise RuntimeError(f"yt-dlp timed out after {timeout}s")


+def _video_id_from_url(url: str) -> str | None:
+    """Return the YouTube id captured by the first matching `_YT_ID_PATTERNS` regex, or None."""
+    for pat in _YT_ID_PATTERNS:
+        m = pat.search(url)
+        if m:
+            return m.group(1)
+    return None
+
+
 def _probe_remote(url: str) -> tuple[str, bool]:
    """Return (video_id, is_live) for a single video URL, without downloading."""
    res = _ytdlp("--no-playlist", "--print", "id", "--print", "is_live", url,
@@ -110,14 +134,25 @@ def download(url: str, cache_dir: str = "videos") -> str:
    _require_ytdlp()
    os.makedirs(cache_dir, exist_ok=True)

+    # Cheap cache check first: if the id is parseable straight from the URL and a
+    # finalized file already exists, return it without ever hitting YouTube. A
+    # file at that path (the same `<id>.mp4` naming as `out` below) is only created
+    # by the atomic rename further down, so its mere existence guarantees a
+    # complete, already-normalized video — and it could only have been written
+    # after passing the is_live guard, so re-checking is unnecessary on a hit.
+    quick_id = _video_id_from_url(url)
+    if quick_id:
+        cached = os.path.join(cache_dir, f"{quick_id}.mp4")
+        if os.path.exists(cached):
+            print(f"[YT] cached: {cached}")
+            return cached
+
    video_id, is_live = _probe_remote(url)
    if is_live:
        raise RuntimeError(
            f"{url!r} is a live stream; ASCILINE plays finite videos only")

    out = os.path.join(cache_dir, f"{video_id}.mp4")
-    # A file at `out` is only ever created by the atomic rename below, so its
-    # mere existence guarantees a complete, already-normalized video.
    if os.path.exists(out):
        print(f"[YT] cached: {out}")
        return out