From 6047f8ed22950570d279511d703f96ec0dab3592 Mon Sep 17 00:00:00 2001
From: Nate <nathanielsproul04@gmail.com>
Date: Sat, 20 Jun 2026 13:51:54 -0400
Subject: [PATCH] fix(ytdl): instant cache hits via URL id regex + force
 playlist expansion

Address PR #27 review:
- download() now parses the video id straight from the URL and returns a
  cached file without calling _probe_remote(), so cached reruns no longer
  hit YouTube on every startup. Unrecognized URLs fall through to the probe.
- expand_playlist() passes --yes-playlist so a watch?v=...&list=... URL
  expands the full list instead of queuing a single video.
---
 ytdl.py | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/ytdl.py b/ytdl.py
index 8af18d3..8ef2a26 100644
--- a/ytdl.py
+++ b/ytdl.py
@@ -22,6 +22,7 @@ Robustness contract (why this file is more than a one-liner):
   is left untouched: it only ever sees clean, canonical mp4s.
 """
 import os
+import re
 import sys
 import json
 import shutil
@@ -30,6 +31,17 @@ import subprocess
 
 _URL_HINTS = ("http://", "https://", "youtube.com", "youtu.be")
 
+# YouTube video ids are exactly 11 chars of [A-Za-z0-9_-]. Match the id wherever
+# it appears across the common URL shapes so a cached file can be found without
+# touching the network. Anything else (other yt-dlp sources, odd URLs) returns
+# None and falls through to the normal probe+download path.
+_YT_ID = r"([A-Za-z0-9_-]{11})"
+_YT_ID_PATTERNS = (
+    re.compile(r"[?&]v=" + _YT_ID),            # watch?v=ID  (also with &list=...)
+    re.compile(r"youtu\.be/" + _YT_ID),        # youtu.be/ID
+    re.compile(r"/(?:shorts|embed|live|v)/" + _YT_ID),  # shorts/embed/live/v
+)
+
 # What the engine can open without surprises.
 _OK_VCODEC = "h264"
 _OK_ACODEC = "aac"
@@ -61,11 +73,14 @@ def expand_playlist(url: str) -> list[str]:
     Expand a playlist/channel URL into a list of individual video URLs.
 
     Uses `--flat-playlist` so it only reads the index (no per-video download).
+    `--yes-playlist` forces expansion of a `watch?v=...&list=...` URL, which
+    yt-dlp otherwise treats as a single video (queuing only 1 entry).
     Returns ``[url]`` unchanged for a single video, or if expansion fails for
     any reason, so the caller can still attempt a normal single download.
     """
     _require_ytdlp()
-    res = _ytdlp("--flat-playlist", "-J", url, timeout=_PROBE_TIMEOUT)
+    res = _ytdlp("--yes-playlist", "--flat-playlist", "-J", url,
+                 timeout=_PROBE_TIMEOUT)
     if res.returncode != 0 or not res.stdout.strip():
         return [url]
     try:
@@ -93,6 +108,15 @@ def _ytdlp(*args: str, timeout: int = _DL_TIMEOUT) -> subprocess.CompletedProces
         raise RuntimeError(f"yt-dlp timed out after {timeout}s")
 
 
+def _video_id_from_url(url: str) -> str | None:
+    """Extract a YouTube video id from `url` via regex, or None if not recognized."""
+    for pat in _YT_ID_PATTERNS:
+        m = pat.search(url)
+        if m:
+            return m.group(1)
+    return None
+
+
 def _probe_remote(url: str) -> tuple[str, bool]:
     """Return (video_id, is_live) for a single video URL, without downloading."""
     res = _ytdlp("--no-playlist", "--print", "id", "--print", "is_live", url,
@@ -110,14 +134,25 @@ def download(url: str, cache_dir: str = "videos") -> str:
     _require_ytdlp()
     os.makedirs(cache_dir, exist_ok=True)
 
+    # Cheap cache check first: if the id is parseable straight from the URL and a
+    # finalized file already exists, return it without ever hitting YouTube. A
+    # file at `out` is only created by the atomic rename below, so its mere
+    # existence guarantees a complete, already-normalized video — and it could
+    # only have been written after passing the is_live guard, so re-checking is
+    # unnecessary on a hit.
+    quick_id = _video_id_from_url(url)
+    if quick_id:
+        cached = os.path.join(cache_dir, f"{quick_id}.mp4")
+        if os.path.exists(cached):
+            print(f"[YT] cached: {cached}")
+            return cached
+
     video_id, is_live = _probe_remote(url)
     if is_live:
         raise RuntimeError(
             f"{url!r} is a live stream; ASCILINE plays finite videos only")
 
     out = os.path.join(cache_dir, f"{video_id}.mp4")
-    # A file at `out` is only ever created by the atomic rename below, so its
-    # mere existence guarantees a complete, already-normalized video.
     if os.path.exists(out):
         print(f"[YT] cached: {out}")
         return out