[extractor/generic] Separate embed extraction into own function (#5176)

pukkandan 2022-10-09 16:09:36 +05:30 committed by GitHub
parent 4c9a1a3ba5
commit ade1fa70cb
2 changed files with 52 additions and 59 deletions

yt_dlp/extractor/common.py

@@ -66,6 +66,7 @@
     sanitize_filename,
     sanitize_url,
     sanitized_Request,
+    smuggle_url,
     str_or_none,
     str_to_int,
     strip_or_none,
@@ -3873,6 +3874,12 @@ def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
     def RetryManager(self, **kwargs):
         return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)

+    def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
+        display_id = traverse_obj(info_dict, 'display_id', 'id')
+        self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
+        return self._downloader.get_info_extractor('Generic')._extract_embeds(
+            smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
+
     @classmethod
     def extract_from_webpage(cls, ydl, url, webpage):
         ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
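The new `_extract_generic_embeds` helper lets any extractor fall back to generic embed detection without recursing into itself: it smuggles `block_ies` into the URL so that `GenericIE._extract_embeds` (added below) skips the calling extractor. A minimal sketch of how a site extractor might use it; `ExampleIE` and its URL pattern are hypothetical, not part of this commit:

    from yt_dlp.extractor.common import InfoExtractor

    class ExampleIE(InfoExtractor):  # hypothetical
        _VALID_URL = r'https?://example\.com/watch/(?P<id>\w+)'

        def _real_extract(self, url):
            video_id = self._match_id(url)
            webpage = self._download_webpage(url, video_id)
            # Prints '<video_id>: Extracting generic embeds', then delegates to
            # GenericIE._extract_embeds with ExampleIE blocked via 'block_ies'
            entries = self._extract_generic_embeds(url, webpage, info_dict={'id': video_id})
            return self.playlist_result(entries, video_id)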

yt_dlp/extractor/generic.py

@@ -1,5 +1,6 @@
 import os
 import re
+import types
 import urllib.parse
 import xml.etree.ElementTree
@@ -2609,6 +2610,7 @@ def _real_extract(self, url):
             default_search += ':'
             return self.url_result(default_search + url)

+        original_url = url
         url, smuggled_data = unsmuggle_url(url, {})
         force_videoid = None
         is_intentional = smuggled_data.get('to_generic')
@@ -2760,7 +2762,20 @@ def _real_extract(self, url):
             'age_limit': self._rta_search(webpage),
         })

-        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None)
+        self._downloader.write_debug('Looking for embeds')
+        embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
+        if len(embeds) == 1:
+            return {**info_dict, **embeds[0]}
+        elif embeds:
+            return self.playlist_result(embeds, **info_dict)
+        raise UnsupportedError(url)
+
+    def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
+        """Returns an iterator of video entries"""
+        info_dict = types.MappingProxyType(info_dict)  # Prevents accidental mutation
+        video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
+        url, smuggled_data = unsmuggle_url(url, {})
+        actual_url = urlh.geturl() if urlh else url

         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
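`types.MappingProxyType` is the reason for the `import types` added above: it wraps the caller's `info_dict` in a read-only view, so an accidental write inside `_extract_embeds` raises immediately instead of silently mutating the caller's dict:

    import types

    info_dict = {'id': 'abc'}
    proxy = types.MappingProxyType(info_dict)
    print(proxy['id'])   # reads work: 'abc'
    proxy['id'] = 'xyz'  # TypeError: 'mappingproxy' object does not support item assignment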
@@ -2776,31 +2791,19 @@ def _real_extract(self, url):
             lambda x: unescapeHTML(x.group(0)), webpage)

         # TODO: Move to respective extractors
-        self._downloader.write_debug('Looking for Brightcove embeds')
         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
         if bc_urls:
-            entries = [{
-                '_type': 'url',
-                'url': smuggle_url(bc_url, {'Referer': url}),
-                'ie_key': 'BrightcoveLegacy'
-            } for bc_url in bc_urls]
-
-            return {
-                '_type': 'playlist',
-                'title': info_dict['title'],
-                'id': video_id,
-                'entries': entries,
-            }
+            return [self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveLegacyIE)
+                    for bc_url in bc_urls]
         bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
         if bc_urls:
-            return self.playlist_from_matches(
-                bc_urls, video_id, info_dict['title'],
-                getter=lambda x: smuggle_url(x, {'referrer': url}),
-                ie='BrightcoveNew')
+            return [self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveNewIE)
+                    for bc_url in bc_urls]

-        self._downloader.write_debug('Looking for embeds')
         embeds = []
         for ie in self._downloader._ies.values():
+            if ie.ie_key() in smuggled_data.get('block_ies', []):
+                continue
             gen = ie.extract_from_webpage(self._downloader, url, webpage)
             current_embeds = []
             try:
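For context, `smuggle_url` and `unsmuggle_url` (both in `yt_dlp.utils`) pack extra data into a URL and recover it on the other side; that is how the `block_ies` list set by `_extract_generic_embeds` in common.py reaches the loop above. A small round-trip with illustrative values:

    from yt_dlp.utils import smuggle_url, unsmuggle_url

    url = smuggle_url('https://example.com/page', {'block_ies': ['Example']})
    url, data = unsmuggle_url(url, {})
    # data == {'block_ies': ['Example']}; any extractor whose ie_key() is
    # listed is skipped, which prevents infinite recursion into the caller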
@@ -2809,35 +2812,26 @@ def _real_extract(self, url):
             except self.StopExtraction:
                 self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
                                      embeds and 'discarding other embeds')
-                embeds = current_embeds
-                break
+                return current_embeds
             except StopIteration:
                 self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
                 embeds.extend(current_embeds)
-        del current_embeds
-        if len(embeds) == 1:
-            return {**info_dict, **embeds[0]}
-        elif embeds:
-            return self.playlist_result(embeds, **info_dict)
+        if embeds:
+            return embeds

         jwplayer_data = self._find_jwplayer_data(
             webpage, video_id, transform_source=js_to_json)
         if jwplayer_data:
             if isinstance(jwplayer_data.get('playlist'), str):
                 self.report_detected('JW Player playlist')
-                return {
-                    **info_dict,
-                    '_type': 'url',
-                    'ie_key': 'JWPlatform',
-                    'url': jwplayer_data['playlist'],
-                }
+                return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')]
             try:
                 info = self._parse_jwplayer_data(
                     jwplayer_data, video_id, require_title=False, base_url=url)
                 if traverse_obj(info, 'formats', ('entries', ..., 'formats')):
                     self.report_detected('JW Player data')
-                    return merge_dicts(info, info_dict)
+                    return [info]
             except ExtractorError:
                 # See https://github.com/ytdl-org/youtube-dl/pull/16735
                 pass
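The `return current_embeds` above is the other half of the embed framework's `InfoExtractor.StopExtraction` protocol: an extractor's `_extract_from_webpage` generator may raise it after yielding its entries to claim the page exclusively, discarding embeds found by other extractors. A rough sketch of such an extractor; the class and the `data-player-url` marker are hypothetical:

    import re
    from yt_dlp.extractor.common import InfoExtractor

    class ExclusivePlayerIE(InfoExtractor):  # hypothetical
        @classmethod
        def _extract_from_webpage(cls, url, webpage):
            embed_urls = re.findall(r'data-player-url="([^"]+)"', webpage)
            for embed_url in embed_urls:
                yield cls.url_result(embed_url)
            if embed_urls:
                # Claim the page: GenericIE returns only these entries
                raise cls.StopExtraction()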
@@ -2865,7 +2859,7 @@ def _real_extract(self, url):
                     src_type = src_type.lower()
                 ext = determine_ext(src).lower()
                 if src_type == 'video/youtube':
-                    return self.url_result(src, YoutubeIE.ie_key())
+                    return [self.url_result(src, YoutubeIE.ie_key())]
                 if src_type == 'application/dash+xml' or ext == 'mpd':
                     fmts, subs = self._extract_mpd_formats_and_subtitles(
                         src, video_id, mpd_id='dash', fatal=False)
@@ -2883,7 +2877,7 @@ def _real_extract(self, url):
                         'ext': (mimetype2ext(src_type)
                                 or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
                         'http_headers': {
-                            'Referer': full_response.geturl(),
+                            'Referer': actual_url,
                         },
                     })
             # https://docs.videojs.com/player#addRemoteTextTrack
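Side note on the `actual_url` substitution: `_extract_embeds` computes it once as `urlh.geturl() if urlh else url`, so the Referer headers here still reflect the final post-redirect URL when a response handle is available, while the method no longer hard-depends on `full_response` and can be called with `urlh=None` (as `_extract_generic_embeds` does). A minimal restatement of that logic:

    # Mirrors the 'actual_url' logic above; urlh is the page's HTTP response,
    # or None when the caller only has the webpage text
    def resolve_referer(urlh, page_url):
        return urlh.geturl() if urlh else page_url  # geturl() reflects redirects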
@@ -2898,28 +2892,26 @@ def _real_extract(self, url):
                         'url': urllib.parse.urljoin(url, src),
                         'name': sub.get('label'),
                         'http_headers': {
-                            'Referer': full_response.geturl(),
+                            'Referer': actual_url,
                         },
                     })
             if formats or subtitles:
                 self.report_detected('video.js embed')
                 self._sort_formats(formats)
-                info_dict['formats'] = formats
-                info_dict['subtitles'] = subtitles
-                return info_dict
+                return [{'formats': formats, 'subtitles': subtitles}]

         # Looking for http://schema.org/VideoObject
         json_ld = self._search_json_ld(webpage, video_id, default={})
         if json_ld.get('url') not in (url, None):
             self.report_detected('JSON LD')
-            return merge_dicts({
+            return [merge_dicts({
                 '_type': 'video' if json_ld.get('ext') else 'url_transparent',
                 'url': smuggle_url(json_ld['url'], {
                     'force_videoid': video_id,
                     'to_generic': True,
                     'http_headers': {'Referer': url},
                 }),
-            }, json_ld, info_dict)
+            }, json_ld)]

         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
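Dropping `info_dict` from the `merge_dicts` call is safe because `merge_dicts` gives earlier arguments priority, and the page-level metadata merge now happens once in `_real_extract`. Its priority order, with illustrative values:

    from yt_dlp.utils import merge_dicts

    merge_dicts({'title': 'embed title'}, {'title': 'page title', 'uploader': 'someone'})
    # -> {'title': 'embed title', 'uploader': 'someone'} (earlier dicts win)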
@@ -2990,13 +2982,13 @@ def filter_video(urls):
             self._sort_formats(formats)

-            return {
+            return [{
                 'id': flashvars['video_id'],
                 'display_id': display_id,
                 'title': title,
                 'thumbnail': thumbnail,
                 'formats': formats,
-            }
+            }]
         if not found:
             # Broaden the search a little bit
             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
@@ -3050,17 +3042,14 @@ def filter_video(urls):
                 webpage)
         if not found:
             # Look also in Refresh HTTP header
-            refresh_header = full_response.headers.get('Refresh')
+            refresh_header = urlh and urlh.headers.get('Refresh')
             if refresh_header:
                 found = re.search(REDIRECT_REGEX, refresh_header)
             if found:
                 new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1)))
                 if new_url != url:
                     self.report_following_redirect(new_url)
-                    return {
-                        '_type': 'url',
-                        'url': new_url,
-                    }
+                    return [self.url_result(new_url)]
                 else:
                     found = None
@@ -3071,10 +3060,12 @@ def filter_video(urls):
             embed_url = self._html_search_meta('twitter:player', webpage, default=None)
             if embed_url and embed_url != url:
                 self.report_detected('twitter:player iframe')
-                return self.url_result(embed_url)
+                return [self.url_result(embed_url)]

         if not found:
-            raise UnsupportedError(url)
+            return []
+
+        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None)

         entries = []
         for video_url in orderedSet(found):
@@ -3090,7 +3081,7 @@ def filter_video(urls):
                 video_id = os.path.splitext(video_id)[0]

             headers = {
-                'referer': full_response.geturl()
+                'referer': actual_url
             }

             entry_info_dict = {
@@ -3114,7 +3105,7 @@ def filter_video(urls):
             if ext == 'smil':
                 entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict}
             elif ext == 'xspf':
-                return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
+                return [self._extract_xspf_playlist(video_url, video_id)]
             elif ext == 'm3u8':
                 entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
             elif ext == 'mpd':
@@ -3144,14 +3135,9 @@ def filter_video(urls):
             entries.append(entry_info_dict)

-        if len(entries) == 1:
-            return merge_dicts(entries[0], info_dict)
-        else:
+        if len(entries) > 1:
             for num, e in enumerate(entries, start=1):
                 # 'url' results don't have a title
                 if e.get('title') is not None:
                     e['title'] = '%s (%d)' % (e['title'], num)
-            return {
-                '_type': 'playlist',
-                'entries': entries,
-            }
+        return entries