fix(youtube): recognise youtu.be / shorts / embed / live URL shapes

MukundaKatta · claude · MukundaKatta · commit f6e18c459c2b · 2026-04-14T23:39:14.000-07:00
The YouTube converter only accepted canonical watch URLs (https://www.youtube.com/watch?v=...). Short links (youtu.be/<id>), Shorts (/shorts/<id>), embeds, live streams, and m./music. hosts silently fell through to the generic HTML converter, so the caller missed the YouTube-specific title / description / transcript path. Centralise the URL → video_id mapping in a single helper (_extract_video_id) used by both accepts() and convert(). That also removes the brittle urlparse+parse_qs duplication in convert(). Fixes #1775. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -34,6 +34,49 @@
 ]
 
 
+# Hosts we treat as YouTube. The parsed hostname is lower-cased and matched
+# exactly (the scheme is not checked), so every www./m./music. variant is listed.
+_YOUTUBE_HOSTS = {
+    "www.youtube.com",
+    "youtube.com",
+    "m.youtube.com",
+    "music.youtube.com",
+    "youtu.be",
+}
+
+
+def _extract_video_id(url: str) -> str | None:
+    """Return the video id for any supported YouTube URL shape.
+
+    Accepts ``/watch?v=<id>``, ``youtu.be/<id>``, ``/shorts/<id>``,
+    ``/embed/<id>`` and ``/live/<id>`` on the hosts in ``_YOUTUBE_HOSTS``.
+    The id is returned as found in the URL; its length is not validated.
+    Returns ``None`` for anything else so the caller can fall back.
+    """
+    try:
+        parsed = urlparse(url)
+    except ValueError:  # urlparse rejects some malformed URLs
+        return None
+    host = (parsed.hostname or "").lower()  # hostname is None without a netloc
+    if host not in _YOUTUBE_HOSTS:
+        return None
+    # youtu.be/<id>: the id is the first path segment
+    if host == "youtu.be":
+        vid = parsed.path.lstrip("/").split("/", 1)[0]
+        return vid or None
+    # /watch?v=<id>: take the first "v" value; a missing "v" gives None
+    if parsed.path == "/watch":
+        qs = parse_qs(parsed.query)
+        v = qs.get("v", [""])[0]
+        return v or None
+    # /shorts/<id>, /embed/<id>, /live/<id>: the segment right after the prefix
+    for prefix in ("/shorts/", "/embed/", "/live/"):
+        if parsed.path.startswith(prefix):
+            vid = parsed.path[len(prefix):].split("/", 1)[0]
+            return vid or None
+    return None
+
+
 class YouTubeConverter(DocumentConverter):
     """Handle YouTube specially, focusing on the video title, description, and transcript."""
 
@@ -53,8 +96,8 @@ def accepts(
         url = unquote(url)
         url = url.replace(r"\?", "?").replace(r"\=", "=")
 
-        if not url.startswith("https://www.youtube.com/watch?"):
-            # Not a YouTube URL
+        if _extract_video_id(url) is None:
+            # Not a YouTube URL we can handle
             return False
 
         if extension in ACCEPTED_FILE_EXTENSIONS:
@@ -147,10 +190,10 @@ def convert(
         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
             ytt_api = YouTubeTranscriptApi()
             transcript_text = ""
-            parsed_url = urlparse(stream_info.url)  # type: ignore
-            params = parse_qs(parsed_url.query)  # type: ignore
-            if "v" in params and params["v"][0]:
-                video_id = str(params["v"][0])
+            raw_url = stream_info.url or ""
+            raw_url = unquote(raw_url).replace(r"\?", "?").replace(r"\=", "=")
+            video_id = _extract_video_id(raw_url)
+            if video_id:
                 transcript_list = ytt_api.list(video_id)
                 languages = ["en"]
                 for transcript in transcript_list:
diff --git a/packages/markitdown/tests/test_youtube_url_shapes.py b/packages/markitdown/tests/test_youtube_url_shapes.py
@@ -0,0 +1,79 @@
+"""Unit tests for the YouTube URL-shape helper.
+
+Covers the canonical watch URL, the forms that used to fall through to the
+generic HTML converter (``youtu.be/<id>``, ``/shorts/<id>``, ``/embed/<id>``,
+``/live/<id>``, the ``m.`` / ``music.`` hosts), and inputs that yield ``None``.
+"""
+
+from markitdown.converters._youtube_converter import _extract_video_id
+
+
+def test_canonical_watch_url():  # baseline: the shape accepted before this change
+    assert _extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"
+
+
+def test_watch_url_with_extra_params():  # other query params must not affect "v"
+    assert (
+        _extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=42s&feature=share")
+        == "dQw4w9WgXcQ"
+    )
+
+
+def test_mobile_watch_url():  # the m. host must be recognised
+    assert _extract_video_id("https://m.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"
+
+
+def test_music_youtube_watch_url():  # the music. host must be recognised
+    assert (
+        _extract_video_id("https://music.youtube.com/watch?v=dQw4w9WgXcQ")
+        == "dQw4w9WgXcQ"
+    )
+
+
+def test_short_url():  # youtu.be: the id is the first path segment
+    assert _extract_video_id("https://youtu.be/dQw4w9WgXcQ") == "dQw4w9WgXcQ"
+
+
+def test_short_url_with_timestamp():  # ?t= is query, not path, so it is ignored
+    assert _extract_video_id("https://youtu.be/dQw4w9WgXcQ?t=30") == "dQw4w9WgXcQ"
+
+
+def test_shorts_url():  # the id follows the /shorts/ prefix
+    assert (
+        _extract_video_id("https://www.youtube.com/shorts/dQw4w9WgXcQ")
+        == "dQw4w9WgXcQ"
+    )
+
+
+def test_embed_url():  # the id follows the /embed/ prefix
+    assert (
+        _extract_video_id("https://www.youtube.com/embed/dQw4w9WgXcQ")
+        == "dQw4w9WgXcQ"
+    )
+
+
+def test_live_url():  # the id follows the /live/ prefix
+    assert (
+        _extract_video_id("https://www.youtube.com/live/dQw4w9WgXcQ")
+        == "dQw4w9WgXcQ"
+    )
+
+
+def test_non_youtube_url_returns_none():  # host is not a YouTube host
+    assert _extract_video_id("https://vimeo.com/12345") is None
+
+
+def test_watch_without_v_param_returns_none():  # no "v" must give None, not ""
+    assert _extract_video_id("https://www.youtube.com/watch") is None
+
+
+def test_channel_url_returns_none():  # YouTube host, but not a video path shape
+    assert _extract_video_id("https://www.youtube.com/channel/UC123") is None
+
+
+def test_empty_string_returns_none():  # empty input has no hostname to match
+    assert _extract_video_id("") is None
+
+
+def test_unknown_host_returns_none():  # a /watch?v= path alone is not enough
+    assert _extract_video_id("https://example.com/watch?v=dQw4w9WgXcQ") is None