From b6de707d13ca3b7a573d9695b7fc0616fe394f60 Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Mon, 6 Sep 2021 07:26:41 +0000 Subject: [PATCH] [youtube] Improvements to JS player extraction (See desc) (#860) * fallback player url extraction when it fails to be extracted from the webpage * don't download js player unnecessarily for clients that don't require it * try to extract js player url from any additional client configs * ability to skip the js player usage/download using `player_skip=js` * ability to skip the initial webpage download using `player_skip=webpage` known issue: * authentication for multi-channel accounts and multi-account cookies may not work correctly if the webpage or client configs are skipped * formats from the web client requiring signature decryption will be skipped if player js extraction is skipped Authored by: coletdjnz --- README.md | 2 +- yt_dlp/extractor/youtube.py | 83 +++++++++++++++++++++++++------------ 2 files changed, 57 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 2e4bedc93..d9daee69e 100644 --- a/README.md +++ b/README.md @@ -1436,7 +1436,7 @@ # EXTRACTOR ARGUMENTS * **youtube** * `skip`: `hls` or `dash` (or both) to skip download of the respective manifests * `player_client`: Clients to extract video data from. The main clients are `web`, `android`, `ios`, `mweb`. These also have `_music`, `_embedded`, `_agegate`, and `_creator` variants (Eg: `web_embedded`) (`mweb` has only `_agegate`). By default, `android,web` is used, but the agegate and creator variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can also use `all` to use all the clients - * `player_skip`: `configs` - skip any requests for client configs and use defaults + * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `include_live_dash`: Include live dash formats (These formats don't download properly) * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side). * `max_comments`: Maximum amount of comments to download (default all). diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 65a6c043e..1549c36df 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -117,6 +117,7 @@ } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False }, 'android_embedded': { 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', @@ -126,7 +127,8 @@ 'clientVersion': '16.20', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 55 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, + 'REQUIRE_JS_PLAYER': False }, 'android_music': { 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', @@ -138,6 +140,7 @@ } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, + 'REQUIRE_JS_PLAYER': False }, 'android_creator': { 'INNERTUBE_CONTEXT': { @@ -146,7 +149,8 @@ 'clientVersion': '21.24.100', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 14 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, + 'REQUIRE_JS_PLAYER': False }, # ios has HLS live streams # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680 @@ -158,7 +162,8 @@ 'clientVersion': '16.20', } }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 5 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'REQUIRE_JS_PLAYER': False }, 'ios_embedded': { 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', @@ -168,7 +173,8 @@ 'clientVersion': '16.20', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 66 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, + 'REQUIRE_JS_PLAYER': False }, 'ios_music': { 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og', @@ -179,7 +185,8 @@ 'clientVersion': '4.32', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 26 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, + 'REQUIRE_JS_PLAYER': False }, 'ios_creator': { 'INNERTUBE_CONTEXT': { @@ -188,7 +195,8 @@ 'clientVersion': '21.24.100', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 15 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, + 'REQUIRE_JS_PLAYER': False }, # mweb has 'ultralow' formats # See: https://github.com/yt-dlp/yt-dlp/pull/557 @@ -215,6 +223,7 @@ def build_innertube_clients(): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') + ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') ytcfg['priority'] = 10 * priority(client.split('_', 1)[0]) @@ -1858,14 +1867,12 @@ def __init__(self, *args, **kwargs): self._code_cache = {} self._player_cache = {} - def _extract_player_url(self, ytcfg=None, webpage=None): - player_url = try_get(ytcfg, (lambda x: x['PLAYER_JS_URL']), str) - if not player_url and webpage: - player_url = self._search_regex( - r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', - webpage, 'player URL', fatal=False) + def _extract_player_url(self, *ytcfgs, webpage=None): + player_url = traverse_obj( + ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'), + get_all=False, expected_type=compat_str) if not player_url: - return None + return if player_url.startswith('//'): player_url = 'https:' + player_url elif not re.match(r'https?://', player_url): @@ -1873,6 +1880,16 @@ def _extract_player_url(self, ytcfg=None, webpage=None): 'https://www.youtube.com', player_url) return player_url + def _download_player_url(self, video_id, fatal=False): + res = self._download_webpage( + 'https://www.youtube.com/iframe_api', + note='Downloading iframe API JS', video_id=video_id, fatal=fatal) + if res: + player_version = self._search_regex( + r'player\\?/([0-9a-fA-F]{8})\\?/', res, 'player version', fatal=fatal) + if player_version: + return f'https://www.youtube.com/s/player/{player_version}/player_ias.vflset/en_US/base.js' + def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) @@ -2462,7 +2479,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, session_index = self._extract_session_index(player_ytcfg, master_ytcfg) syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) - sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) + sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None headers = self.generate_api_headers( player_ytcfg, identity_token, syncid, default_client=client, session_index=session_index) @@ -2507,7 +2524,7 @@ def _extract_player_ytcfg(self, client, video_id): webpage = self._download_webpage(url, video_id, fatal=False, note=f'Downloading {client} config') return self.extract_ytcfg(video_id, webpage) or {} - def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, player_url, identity_token): + def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, identity_token): initial_pr = None if webpage: initial_pr = self._extract_yt_initial_variable( @@ -2516,6 +2533,7 @@ def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, pl original_clients = clients clients = clients[::-1] + prs = [] def append_client(client_name): if client_name in INNERTUBE_CLIENTS and client_name not in original_clients: @@ -2525,23 +2543,33 @@ def append_client(client_name): # extraction of some data. So we return the initial_pr with formats # stripped out even if not requested by the user # See: https://github.com/yt-dlp/yt-dlp/issues/501 - yielded_pr = False if initial_pr: pr = dict(initial_pr) pr['streamingData'] = None - yielded_pr = True - yield pr + prs.append(pr) last_error = None + tried_iframe_fallback = False + player_url = None while clients: client = clients.pop() player_ytcfg = master_ytcfg if client == 'web' else {} if 'configs' not in self._configuration_arg('player_skip'): player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg + player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) + require_js_player = self._get_default_ytcfg(client).get('REQUIRE_JS_PLAYER') + if 'js' in self._configuration_arg('player_skip'): + require_js_player = False + player_url = None + + if not player_url and not tried_iframe_fallback and require_js_player: + player_url = self._download_player_url(video_id) + tried_iframe_fallback = True + try: pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response( - client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url, initial_pr) + client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, identity_token, player_url if require_js_player else None, initial_pr) except ExtractorError as e: if last_error: self.report_warning(last_error) @@ -2549,8 +2577,7 @@ def append_client(client_name): continue if pr: - yielded_pr = True - yield pr + prs.append(pr) # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in if client.endswith('_agegate') and self._is_unplayable(pr) and self._generate_sapisidhash_header(): @@ -2559,9 +2586,10 @@ def append_client(client_name): append_client(f'{client}_agegate') if last_error: - if not yielded_pr: + if not len(prs): raise last_error self.report_warning(last_error) + return prs, player_url def _extract_formats(self, streaming_data, video_id, player_url, is_live): itags, stream_ids = [], [] @@ -2708,16 +2736,17 @@ def _real_extract(self, url): base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id - webpage = self._download_webpage( - webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) + webpage = None + if 'webpage' not in self._configuration_arg('player_skip'): + webpage = self._download_webpage( + webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() - player_url = self._extract_player_url(master_ytcfg, webpage) identity_token = self._extract_identity_token(webpage, video_id) - player_responses = list(self._extract_player_responses( + player_responses, player_url = self._extract_player_responses( self._get_requested_clients(url, smuggled_data), - video_id, webpage, master_ytcfg, player_url, identity_token)) + video_id, webpage, master_ytcfg, identity_token) get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)