[ie/youtube] Add po_token, visitor_data, data_sync_id extractor args (#10648)

Authored by: seproDev, coletdjnz, bashonly
2024-09-19 20:51:19 +02:00 · 2024-09-13 12:51:58 +02:00 · 2024-09-13 12:51:58 +02:00 · 3a3bd00037
commit 3a3bd00037
parent d1c4d88b2d
2 changed files with 182 additions and 78 deletions
--- a/README.md
+++ b/README.md
@ -1777,6 +1777,9 @@ #### youtube
 * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
 * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used
 * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning
 * `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage`
 * `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID)
 * `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. Comma-separated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY`
 #### youtubetab (YouTube playlists, channels, feeds, etc.)
 * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -69,6 +69,8 @@
 )
 STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client'
 STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token'
 # any clients starting with _ cannot be explicitly requested by the user
 INNERTUBE_CLIENTS = {
    'web': {
@ -79,6 +81,7 @@
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
        'REQUIRE_PO_TOKEN': True,
    },
    # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats
    'web_safari': {
@ -90,6 +93,7 @@
            },
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
        'REQUIRE_PO_TOKEN': True,
    },
    'web_embedded': {
        'INNERTUBE_CONTEXT': {
@ -132,6 +136,7 @@
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
        'REQUIRE_JS_PLAYER': False,
        'REQUIRE_PO_TOKEN': True,
    },
    'android_music': {
        'INNERTUBE_CONTEXT': {
@ -146,6 +151,7 @@
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
        'REQUIRE_JS_PLAYER': False,
        'REQUIRE_PO_TOKEN': True,
    },
    'android_creator': {
        'INNERTUBE_CONTEXT': {
@ -160,6 +166,7 @@
        },
        'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
        'REQUIRE_JS_PLAYER': False,
        'REQUIRE_PO_TOKEN': True,
    },
    # YouTube Kids videos aren't returned on this client for some reason
    'android_vr': {
@ -323,6 +330,7 @@ def build_innertube_clients():
    for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
        ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
        ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
        ytcfg.setdefault('REQUIRE_PO_TOKEN', False)
        ytcfg.setdefault('PLAYER_PARAMS', None)
        ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
@ -688,31 +696,46 @@ def _extract_identity_token(self, ytcfg=None, webpage=None):
                r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
                'identity token', default=None, fatal=False)
-    @staticmethod
+    def _data_sync_id_to_delegated_session_id(self, data_sync_id):
-    def _extract_account_syncid(*args):
+        if not data_sync_id:
            return
        # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
        # and just "user_syncid||" for primary channel. We only want the channel_syncid
        channel_syncid, _, user_syncid = data_sync_id.partition('||')
        if user_syncid:
            return channel_syncid
    def _extract_account_syncid(self, *args):
        """
-        Extract syncId required to download private playlists of secondary channels
+        Extract current session ID required to download private playlists of secondary channels
        @params response and/or ytcfg
        """
-        for data in args:
+        # ytcfg includes channel_syncid if on secondary channel
-            # ytcfg includes channel_syncid if on secondary channel
+        if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)):
-            delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str)
+            return delegated_sid
            if delegated_sid:
                return delegated_sid
            sync_ids = (try_get(
                data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'],
                       lambda x: x['DATASYNC_ID']), str) or '').split('||')
            if len(sync_ids) >= 2 and sync_ids[1]:
                # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
                # and just "user_syncid||" for primary channel. We only want the channel_syncid
                return sync_ids[0]
-    @staticmethod
+        data_sync_id = self._extract_data_sync_id(*args)
-    def _extract_visitor_data(*args):
+        return self._data_sync_id_to_delegated_session_id(data_sync_id)
    def _extract_data_sync_id(self, *args):
        """
        Extract current account dataSyncId.
        In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID||

        A non-empty `data_sync_id` extractor arg takes precedence over anything
        found in the passed objects.
        @params response and/or ytcfg
        """
        configured = self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)[0]
        if configured:
            return configured

        # Key used by ytcfg, and the nested path used by API responses
        datasync_paths = (
            'DATASYNC_ID',
            ('responseContext', 'mainAppWebResponseContext', 'datasyncId'),
        )
        return traverse_obj(args, (..., datasync_paths, {str}, any))
    def _extract_visitor_data(self, *args):
        """
        Extracts visitorData from an API response or ytcfg
        Appears to be used to track session state

        A non-empty `visitor_data` extractor arg takes precedence over anything
        found in the passed objects.
        """
        configured = self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]
        if configured:
            return configured

        # Candidate locations of the value in ytcfg / API response objects
        visitor_data_paths = (
            'VISITOR_DATA',
            ('INNERTUBE_CONTEXT', 'client', 'visitorData'),
            ('responseContext', 'visitorData'),
        )
        return get_first(args, [visitor_data_paths], expected_type=str)
@ -1334,11 +1357,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'},
    }
    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
    _POTOKEN_EXPERIMENTS = ('51217476', '51217102')
    _BROKEN_CLIENTS = {
        short_client_name(client): client
        for client in ('android', 'android_creator', 'android_music')
    }
    _DEFAULT_CLIENTS = ('ios', 'web_creator')
    _GEO_BYPASS = False
@ -3701,6 +3719,54 @@ def _generate_player_context(cls, sts=None):
            **cls._get_checkok_params(),
        }
    def _get_config_po_token(self, client):
        """
        Look up a user-supplied PO Token for the given client.

        Reads the `po_token` extractor arg, whose entries are expected in the
        form "client+po_token". Entries without a "+" are reported with a warning
        and skipped.
        @param client  client name to match against the part before the "+"
        @returns       the token of the first entry matching the client, else None
        """
        configured_entries = self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True)
        for entry in configured_entries:
            entry_client, plus, entry_token = entry.partition('+')
            if not plus:
                self.report_warning(
                    f'Invalid po_token configuration format. Expected "client+po_token", got "{entry}"', only_once=True)
            elif entry_client == client:
                return entry_token
        return None
    def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
        """
        Get a PO Token for the given client.

        A token supplied through the `po_token` extractor arg is preferred;
        otherwise the request is handed to _fetch_po_token().
        The Visitor Data / Data Sync ID checks below are only applied when
        player_url is set.
        @param client        client name the token is requested for
        @param visitor_data  Visitor Data the token is bound to when logged out
        @param data_sync_id  Data Sync ID the token is bound to when logged in
        @param player_url    player URL, forwarded to _fetch_po_token()
        @param kwargs        forwarded unchanged to _fetch_po_token()
        @returns             the PO Token, or None if none could be obtained
        """
        # PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function.
        if not visitor_data and not self.is_authenticated and player_url:
            self.report_warning(
                f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. '
                'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"')
            return None

        config_po_token = self._get_config_po_token(client)

        # PO Token is bound to data_sync_id / account Session ID when logged in
        missing_data_sync_id = not data_sync_id and self.is_authenticated and player_url
        data_sync_id_hint = 'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"'

        if config_po_token:
            # For the config po_token we only warn instead of bailing out:
            # if using first channel in an account then we don't need the data_sync_id anymore...
            if missing_data_sync_id:
                self.report_warning(
                    f'Got a PO Token for {client} client, but missing Data Sync ID for account. '
                    f'Formats may not work. {data_sync_id_hint}')
            return config_po_token

        # Require Data Sync ID if logged in for external fetching
        if missing_data_sync_id:
            self.report_warning(
                f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. '
                f'{data_sync_id_hint}')
            return None

        return self._fetch_po_token(
            client=client,
            visitor_data=visitor_data,
            data_sync_id=data_sync_id,
            player_url=player_url,
            **kwargs,
        )
    def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs):
        """
        External PO Token fetch stub

        fetch_po_token() calls this with its own client, visitor_data, data_sync_id,
        player_url and extra keyword arguments once no configured PO Token was found.
        This implementation has no body, so it returns None.
        """
    @staticmethod
    def _is_agegated(player_response):
        if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
@ -3717,13 +3783,17 @@ def _is_agegated(player_response):
    def _is_unplayable(player_response):
        return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
-    def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data):
+    def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token):
        session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
        syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr)
        sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
        headers = self.generate_api_headers(
-            ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client)
+            ytcfg=player_ytcfg,
            default_client=client,
            visitor_data=visitor_data,
            session_index=self._extract_session_index(master_ytcfg, player_ytcfg),
            account_syncid=(
                self._data_sync_id_to_delegated_session_id(data_sync_id)
                or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg)
            ),
        )
        yt_query = {
            'videoId': video_id,
@ -3734,6 +3804,10 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
        if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]:
            yt_query['params'] = player_params
        if po_token:
            yt_query['serviceIntegrityDimensions'] = {'poToken': po_token}
        sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None
        yt_query.update(self._generate_player_context(sts))
        return self._extract_response(
            item_id=video_id, ep='player', query=yt_query,
@ -3744,7 +3818,6 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
    def _get_requested_clients(self, url, smuggled_data):
        requested_clients = []
        broken_clients = []
        excluded_clients = []
        allowed_clients = sorted(
            (client for client in INNERTUBE_CLIENTS if client[:1] != '_'),
@ -3758,12 +3831,8 @@ def _get_requested_clients(self, url, smuggled_data):
                excluded_clients.append(client[1:])
            elif client not in allowed_clients:
                self.report_warning(f'Skipping unsupported client "{client}"')
            elif client in self._BROKEN_CLIENTS.values():
                broken_clients.append(client)
            else:
                requested_clients.append(client)
        # Force deprioritization of _BROKEN_CLIENTS for format de-duplication
        requested_clients.extend(broken_clients)
        if not requested_clients:
            requested_clients.extend(self._DEFAULT_CLIENTS)
        for excluded_client in excluded_clients:
@ -3788,19 +3857,14 @@ def _invalid_player_response(self, pr, video_id):
            return pr_id
    def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data):
-        initial_pr = ignore_initial_response = None
+        initial_pr = None
        if webpage:
            if 'web' in clients:
                experiments = traverse_obj(master_ytcfg, (
                    'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'serializedExperimentIds', {lambda x: x.split(',')}, ...))
                if all(x in experiments for x in self._POTOKEN_EXPERIMENTS):
                    self.report_warning(
                        'Webpage contains broken formats (poToken experiment detected). Ignoring initial player response')
                    ignore_initial_response = True
            initial_pr = self._search_json(
                self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False)
        prs = []
        deprioritized_prs = []
        if initial_pr and not self._invalid_player_response(initial_pr, video_id):
            # Android player_response does not have microFormats which are needed for
            # extraction of some data. So we return the initial_pr with formats
@ -3822,14 +3886,13 @@ def append_client(*client_names):
                        return
        tried_iframe_fallback = False
-        player_url = None
+        player_url = visitor_data = data_sync_id = None
        skipped_clients = {}
        while clients:
            deprioritize_pr = False
            client, base_client, variant = _split_innertube_client(clients.pop())
-            player_ytcfg = {}
+            player_ytcfg = master_ytcfg if client == 'web' else {}
-            if client == 'web':
+            if 'configs' not in self._configuration_arg('player_skip') and client != 'web':
                player_ytcfg = self._get_default_ytcfg() if ignore_initial_response else master_ytcfg
            elif 'configs' not in self._configuration_arg('player_skip'):
                player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg
            player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage)
@ -3842,34 +3905,53 @@ def append_client(*client_names):
                player_url = self._download_player_url(video_id)
                tried_iframe_fallback = True
-            pr = initial_pr if client == 'web' and not ignore_initial_response else None
+            visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg)
-            for retry in self.RetryManager(fatal=False):
+            data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg)
-                try:
+            po_token = self.fetch_po_token(
-                    pr = pr or self._extract_player_response(
+                client=client, visitor_data=visitor_data,
-                        client, video_id, player_ytcfg or master_ytcfg, player_ytcfg,
+                data_sync_id=data_sync_id if self.is_authenticated else None,
-                        player_url if require_js_player else None, initial_pr, smuggled_data)
+                player_url=player_url if require_js_player else None,
-                except ExtractorError as e:
+            )
-                    self.report_warning(e)
+
-                    break
+            require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN')
-                experiments = traverse_obj(pr, (
+            if not po_token and require_po_token:
-                    'responseContext', 'serviceTrackingParams', lambda _, v: v['service'] == 'GFEEDBACK',
+                self.report_warning(
-                    'params', lambda _, v: v['key'] == 'e', 'value', {lambda x: x.split(',')}, ...))
+                    f'No PO Token provided for {client} client, '
-                if all(x in experiments for x in self._POTOKEN_EXPERIMENTS):
+                    f'which is required for working {client} formats. '
-                    pr = None
+                    f'You can manually pass a PO Token for this client with '
-                    retry.error = ExtractorError('API returned broken formats (poToken experiment detected)', expected=True)
+                    f'--extractor-args "youtube:po_token={client}+XXX"',
-            if not pr:
+                    only_once=True)
                deprioritize_pr = True
            pr = initial_pr if client == 'web' else None
            try:
                pr = pr or self._extract_player_response(
                    client, video_id,
                    master_ytcfg=player_ytcfg or master_ytcfg,
                    player_ytcfg=player_ytcfg,
                    player_url=player_url,
                    initial_pr=initial_pr,
                    visitor_data=visitor_data,
                    data_sync_id=data_sync_id,
                    po_token=po_token)
            except ExtractorError as e:
                self.report_warning(e)
                continue
            if pr_id := self._invalid_player_response(pr, video_id):
                skipped_clients[client] = pr_id
            elif pr:
                # Save client name for introspection later
                name = short_client_name(client)
                sd = traverse_obj(pr, ('streamingData', {dict})) or {}
-                sd[STREAMING_DATA_CLIENT_NAME] = name
+                sd[STREAMING_DATA_CLIENT_NAME] = client
                sd[STREAMING_DATA_PO_TOKEN] = po_token
                for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
-                    f[STREAMING_DATA_CLIENT_NAME] = name
+                    f[STREAMING_DATA_CLIENT_NAME] = client
-                prs.append(pr)
+                    f[STREAMING_DATA_PO_TOKEN] = po_token
                if deprioritize_pr:
                    deprioritized_prs.append(pr)
                else:
                    prs.append(pr)
            # tv_embedded can work around age-gate and age-verification IF the video is embeddable
            if self._is_agegated(pr) and variant != 'tv_embedded':
@ -3893,6 +3975,8 @@ def append_client(*client_names):
                # _producer, _testsuite, & _vr variants can also work around age-verification
                append_client('web_creator', 'mediaconnect')
        prs.extend(deprioritized_prs)
        if skipped_clients:
            self.report_warning(
                f'Skipping player responses from {"/".join(skipped_clients)} clients '
@ -4027,13 +4111,17 @@ def build_fragments(f):
                    f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
            client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
-            # _BROKEN_CLIENTS return videoplayback URLs that expire after 30 seconds
+            po_token = fmt.get(STREAMING_DATA_PO_TOKEN)
-            # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554
+
-            is_broken = client_name in self._BROKEN_CLIENTS
+            if po_token:
                fmt_url = update_url_query(fmt_url, {'pot': po_token})
            # Clients that require PO Token return videoplayback URLs that may return 403
            is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN'))
            if is_broken:
                self.report_warning(
-                    f'{video_id}: {self._BROKEN_CLIENTS[client_name]} client formats are broken '
+                    f'{video_id}: {client_name} client formats require a PO Token which was not provided. '
-                    'and may yield HTTP Error 403. They will be deprioritized', only_once=True)
+                    'They will be deprioritized as they may yield HTTP Error 403', only_once=True)
            name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
            fps = int_or_none(fmt.get('fps')) or 0
@ -4109,12 +4197,24 @@ def build_fragments(f):
        elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live':
            skip_manifests.add('dash')
-        def process_manifest_format(f, proto, client_name, itag):
+        def process_manifest_format(f, proto, client_name, itag, po_token):
            key = (proto, f.get('language'))
            if not all_formats and key in itags[itag]:
                return False
            itags[itag].add(key)
            if f.get('source_preference') is None:
                f['source_preference'] = -1
            # Clients that require PO Token return videoplayback URLs that may return 403
            # hls does not currently require PO Token
            if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls':
                self.report_warning(
                    f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. '
                    'They will be deprioritized as they may yield HTTP Error 403', only_once=True)
                f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ')
                f['source_preference'] -= 20
            if itag and all_formats:
                f['format_id'] = f'{itag}-{proto}'
            elif any(p != proto for p, _ in itags[itag]):
@ -4126,9 +4226,6 @@ def process_manifest_format(f, proto, client_name, itag):
                f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ')
                f['language_preference'] = PREFERRED_LANG_VALUE
            if f.get('source_preference') is None:
                f['source_preference'] = -1
            if itag in ('616', '235'):
                f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ')
                f['source_preference'] += 100
@ -4149,23 +4246,27 @@ def process_manifest_format(f, proto, client_name, itag):
        subtitles = {}
        for sd in streaming_data:
            client_name = sd.get(STREAMING_DATA_CLIENT_NAME)
-
+            po_token = sd.get(STREAMING_DATA_PO_TOKEN)
            hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl')
            if hls_manifest_url:
                if po_token:
                    hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}'
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
                subtitles = self._merge_subtitles(subs, subtitles)
                for f in fmts:
                    if process_manifest_format(f, 'hls', client_name, self._search_regex(
-                            r'/itag/(\d+)', f['url'], 'itag', default=None)):
+                            r'/itag/(\d+)', f['url'], 'itag', default=None), po_token):
                        yield f
            dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl')
            if dash_manifest_url:
                if po_token:
                    dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}'
                formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
                subtitles = self._merge_subtitles(subs, subtitles)  # Prioritize HLS subs over DASH
                for f in formats:
-                    if process_manifest_format(f, 'dash', client_name, f['format_id']):
+                    if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token):
                        f['filesize'] = int_or_none(self._search_regex(
                            r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
                        if needs_live_processing: