Skip to content

Commit f6e18c4

Browse files
MukundaKatta and claude committed
fix(youtube): recognise youtu.be / shorts / embed / live URL shapes
The YouTube converter only accepted canonical watch URLs (https://www.youtube.com/watch?v=...). Short links (youtu.be/<id>), Shorts (/shorts/<id>), embeds (/embed/<id>), live streams (/live/<id>), and the m. and music. subdomains silently fell through to the generic HTML converter, so the caller missed the YouTube-specific title / description / transcript path.

Centralise the URL → video_id mapping in a single helper (_extract_video_id) used by both accepts() and convert(). That also removes the brittle urlparse+parse_qs duplication in convert().

Fixes #1775.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 63cbbd9 commit f6e18c4

2 files changed

Lines changed: 128 additions & 6 deletions

File tree

packages/markitdown/src/markitdown/converters/_youtube_converter.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,49 @@
3434
]
3535

3636

37+
# Hosts we treat as YouTube. Each entry covers schemes http/https and an
38+
# optional leading "www." / "m." so mobile and short links work too.
39+
_YOUTUBE_HOSTS = {
40+
"www.youtube.com",
41+
"youtube.com",
42+
"m.youtube.com",
43+
"music.youtube.com",
44+
"youtu.be",
45+
}
46+
47+
48+
def _extract_video_id(url: str) -> str | None:
49+
"""Return the 11-character video id for any supported YouTube URL shape.
50+
51+
Accepts the canonical watch URL (``/watch?v=...``), the short form
52+
(``youtu.be/<id>``), Shorts (``/shorts/<id>``), embed (``/embed/<id>``),
53+
and live (``/live/<id>``). Returns ``None`` for anything else so the
54+
caller can fall back to the regular HTML converter.
55+
"""
56+
try:
57+
parsed = urlparse(url)
58+
except ValueError:
59+
return None
60+
host = (parsed.hostname or "").lower()
61+
if host not in _YOUTUBE_HOSTS:
62+
return None
63+
# youtu.be/<id>
64+
if host == "youtu.be":
65+
vid = parsed.path.lstrip("/").split("/", 1)[0]
66+
return vid or None
67+
# /watch?v=<id>
68+
if parsed.path == "/watch":
69+
qs = parse_qs(parsed.query)
70+
v = qs.get("v", [""])[0]
71+
return v or None
72+
# /shorts/<id>, /embed/<id>, /live/<id>
73+
for prefix in ("/shorts/", "/embed/", "/live/"):
74+
if parsed.path.startswith(prefix):
75+
vid = parsed.path[len(prefix):].split("/", 1)[0]
76+
return vid or None
77+
return None
78+
79+
3780
class YouTubeConverter(DocumentConverter):
3881
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
3982

@@ -53,8 +96,8 @@ def accepts(
5396
url = unquote(url)
5497
url = url.replace(r"\?", "?").replace(r"\=", "=")
5598

56-
if not url.startswith("https://www.youtube.com/watch?"):
57-
# Not a YouTube URL
99+
if _extract_video_id(url) is None:
100+
# Not a YouTube URL we can handle
58101
return False
59102

60103
if extension in ACCEPTED_FILE_EXTENSIONS:
@@ -147,10 +190,10 @@ def convert(
147190
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
148191
ytt_api = YouTubeTranscriptApi()
149192
transcript_text = ""
150-
parsed_url = urlparse(stream_info.url) # type: ignore
151-
params = parse_qs(parsed_url.query) # type: ignore
152-
if "v" in params and params["v"][0]:
153-
video_id = str(params["v"][0])
193+
raw_url = stream_info.url or ""
194+
raw_url = unquote(raw_url).replace(r"\?", "?").replace(r"\=", "=")
195+
video_id = _extract_video_id(raw_url)
196+
if video_id:
154197
transcript_list = ytt_api.list(video_id)
155198
languages = ["en"]
156199
for transcript in transcript_list:
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""Unit tests for the YouTube URL-shape helper.
2+
3+
Covers the URL forms that used to fall through to the generic HTML
4+
converter: ``youtu.be/<id>`` short links, ``youtube.com/shorts/<id>``,
5+
``/embed/<id>``, ``/live/<id>``, and the mobile host ``m.youtube.com``.
6+
"""
7+
8+
from markitdown.converters._youtube_converter import _extract_video_id
9+
10+
11+
def test_canonical_watch_url():
    """The standard ``/watch?v=<id>`` URL yields the value of ``v``."""
    url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    assert _extract_video_id(url) == "dQw4w9WgXcQ"
13+
14+
15+
def test_watch_url_with_extra_params():
    """Extra query parameters after ``v`` do not affect the extracted id."""
    url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=42s&feature=share"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"
20+
21+
22+
def test_mobile_watch_url():
    """The mobile host ``m.youtube.com`` is recognised for watch URLs."""
    url = "https://m.youtube.com/watch?v=dQw4w9WgXcQ"
    assert _extract_video_id(url) == "dQw4w9WgXcQ"
24+
25+
26+
def test_music_youtube_watch_url():
    """The ``music.youtube.com`` host is recognised for watch URLs."""
    url = "https://music.youtube.com/watch?v=dQw4w9WgXcQ"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"
31+
32+
33+
def test_short_url():
    """A ``youtu.be/<id>`` short link yields the id from its path."""
    url = "https://youtu.be/dQw4w9WgXcQ"
    assert _extract_video_id(url) == "dQw4w9WgXcQ"
35+
36+
37+
def test_short_url_with_timestamp():
    """A ``?t=`` query on a short link is not mistaken for part of the id."""
    url = "https://youtu.be/dQw4w9WgXcQ?t=30"
    assert _extract_video_id(url) == "dQw4w9WgXcQ"
39+
40+
41+
def test_shorts_url():
    """A ``/shorts/<id>`` URL yields the id that follows the prefix."""
    url = "https://www.youtube.com/shorts/dQw4w9WgXcQ"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"
46+
47+
48+
def test_embed_url():
    """An ``/embed/<id>`` URL yields the id that follows the prefix."""
    url = "https://www.youtube.com/embed/dQw4w9WgXcQ"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"
53+
54+
55+
def test_live_url():
    """A ``/live/<id>`` URL yields the id that follows the prefix."""
    url = "https://www.youtube.com/live/dQw4w9WgXcQ"
    video_id = _extract_video_id(url)
    assert video_id == "dQw4w9WgXcQ"
60+
61+
62+
def test_non_youtube_url_returns_none():
    """A URL on a non-YouTube host produces no video id."""
    result = _extract_video_id("https://vimeo.com/12345")
    assert result is None
64+
65+
66+
def test_watch_without_v_param_returns_none():
    """A ``/watch`` URL with no ``v`` query parameter produces no video id."""
    result = _extract_video_id("https://www.youtube.com/watch")
    assert result is None
68+
69+
70+
def test_channel_url_returns_none():
    """A channel page is on a YouTube host but is not a video URL."""
    result = _extract_video_id("https://www.youtube.com/channel/UC123")
    assert result is None
72+
73+
74+
def test_empty_string_returns_none():
    """An empty URL string produces no video id."""
    result = _extract_video_id("")
    assert result is None
76+
77+
78+
def test_unknown_host_returns_none():
    """A watch-shaped URL on an unrelated host produces no video id."""
    result = _extract_video_id("https://example.com/watch?v=dQw4w9WgXcQ")
    assert result is None

0 commit comments

Comments
 (0)