mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-13 02:14:20 +01:00
[generic] Allow multiple matches for generic hits (Fixes #2818)
This commit is contained in:
parent
f1f25be6db
commit
b30b8698ea
1 changed file with 46 additions and 39 deletions
|
@ -637,70 +637,77 @@ def _real_extract(self, url):
|
||||||
return self.url_result(smotri_url, 'Smotri')
|
return self.url_result(smotri_url, 'Smotri')
|
||||||
|
|
||||||
# Start with something easy: JW Player in SWFObject
|
# Start with something easy: JW Player in SWFObject
|
||||||
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
|
found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
|
||||||
if mobj is None:
|
if not found:
|
||||||
# Look for gorilla-vid style embedding
|
# Look for gorilla-vid style embedding
|
||||||
mobj = re.search(r'''(?sx)
|
found = re.findall(r'''(?sx)
|
||||||
(?:
|
(?:
|
||||||
jw_plugins|
|
jw_plugins|
|
||||||
JWPlayerOptions|
|
JWPlayerOptions|
|
||||||
jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
|
jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
|
||||||
)
|
)
|
||||||
.*?file\s*:\s*["\'](.*?)["\']''', webpage)
|
.*?file\s*:\s*["\'](.*?)["\']''', webpage)
|
||||||
if mobj is None:
|
if not found:
|
||||||
# Broaden the search a little bit
|
# Broaden the search a little bit
|
||||||
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
|
found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
|
||||||
if mobj is None:
|
if not found:
|
||||||
# Broaden the search a little bit: JWPlayer JS loader
|
# Broaden the search a little bit: JWPlayer JS loader
|
||||||
mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
|
found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
|
||||||
|
if not found:
|
||||||
if mobj is None:
|
|
||||||
# Try to find twitter cards info
|
# Try to find twitter cards info
|
||||||
mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
|
found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
|
||||||
if mobj is None:
|
if not found:
|
||||||
# We look for Open Graph info:
|
# We look for Open Graph info:
|
||||||
# We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
|
# We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
|
||||||
m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
|
m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
|
||||||
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
|
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
|
||||||
if m_video_type is not None:
|
if m_video_type is not None:
|
||||||
mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
|
found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
|
||||||
if mobj is None:
|
if not found:
|
||||||
# HTML5 video
|
# HTML5 video
|
||||||
mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
|
found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
|
||||||
if mobj is None:
|
if not found:
|
||||||
mobj = re.search(
|
found = re.findall(
|
||||||
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
|
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
|
||||||
r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
|
r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
|
||||||
webpage)
|
webpage)
|
||||||
if mobj:
|
if found:
|
||||||
new_url = mobj.group(1)
|
new_url = found.group(1)
|
||||||
self.report_following_redirect(new_url)
|
self.report_following_redirect(new_url)
|
||||||
return {
|
return {
|
||||||
'_type': 'url',
|
'_type': 'url',
|
||||||
'url': new_url,
|
'url': new_url,
|
||||||
}
|
}
|
||||||
if mobj is None:
|
if not found:
|
||||||
raise ExtractorError('Unsupported URL: %s' % url)
|
raise ExtractorError('Unsupported URL: %s' % url)
|
||||||
|
|
||||||
# It's possible that one of the regexes
|
entries = []
|
||||||
# matched, but returned an empty group:
|
for video_url in found:
|
||||||
if mobj.group(1) is None:
|
video_url = compat_urlparse.urljoin(url, video_url)
|
||||||
raise ExtractorError('Did not find a valid video URL at %s' % url)
|
video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
|
||||||
|
|
||||||
video_url = mobj.group(1)
|
# Sometimes, jwplayer extraction will result in a YouTube URL
|
||||||
video_url = compat_urlparse.urljoin(url, video_url)
|
if YoutubeIE.suitable(video_url):
|
||||||
video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
|
entries.append(self.url_result(video_url, 'Youtube'))
|
||||||
|
continue
|
||||||
|
|
||||||
# Sometimes, jwplayer extraction will result in a YouTube URL
|
# here's a fun little line of code for you:
|
||||||
if YoutubeIE.suitable(video_url):
|
video_id = os.path.splitext(video_id)[0]
|
||||||
return self.url_result(video_url, 'Youtube')
|
|
||||||
|
|
||||||
# here's a fun little line of code for you:
|
entries.append({
|
||||||
video_id = os.path.splitext(video_id)[0]
|
'id': video_id,
|
||||||
|
'url': video_url,
|
||||||
|
'uploader': video_uploader,
|
||||||
|
'title': video_title,
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(entries) == 1:
|
||||||
|
return entries[1]
|
||||||
|
else:
|
||||||
|
for num, e in enumerate(entries, start=1):
|
||||||
|
e['title'] = '%s (%d)' % (e['title'], num)
|
||||||
|
return {
|
||||||
|
'_type': 'playlist',
|
||||||
|
'entries': entries,
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
|
||||||
'id': video_id,
|
|
||||||
'url': video_url,
|
|
||||||
'uploader': video_uploader,
|
|
||||||
'title': video_title,
|
|
||||||
}
|
|
||||||
|
|
Loading…
Reference in a new issue