[ie/vimeo] Always try to extract original format (#10721)

Closes #9163
Authored by: bashonly
2024-09-19 20:51:19 +02:00 · 2024-09-01 18:25:36 -05:00 · 2024-09-01 18:25:36 -05:00 · 4115c24d15
commit 4115c24d15
parent ad9a8115aa
1 changed files with 155 additions and 53 deletions
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -234,13 +234,30 @@ def _parse_config(self, config, video_id):
            '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'),
        }

-    def _extract_original_format(self, url, video_id, unlisted_hash=None):
+    def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None, **kwargs):
+        return self._download_json(
+            join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'),
+            video_id, 'Downloading API JSON', headers={
+                'Authorization': f'jwt {jwt_token}',
+                'Accept': 'application/json',
+            }, query={
+                'fields': ','.join((
+                    'config_url', 'created_time', 'description', 'download', 'license',
+                    'metadata.connections.comments.total', 'metadata.connections.likes.total',
+                    'release_time', 'stats.plays')),
+            }, **kwargs)
+
+    def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None):
+        # Original/source formats are only available when logged in
+        if not self._get_cookies('https://vimeo.com/').get('is_logged_in'):
+            return
+
        query = {'action': 'load_download_config'}
        if unlisted_hash:
            query['unlisted_hash'] = unlisted_hash
        download_data = self._download_json(
-            url, video_id, fatal=False, query=query,
-            headers={'X-Requested-With': 'XMLHttpRequest'},
+            url, video_id, 'Loading download config JSON', fatal=False,
+            query=query, headers={'X-Requested-With': 'XMLHttpRequest'},
            expected_status=(403, 404)) or {}
        source_file = download_data.get('source_file')
        download_url = try_get(source_file, lambda x: x['download_url'])
@@ -261,15 +278,13 @@ def _extract_original_format(self, url, video_id, unlisted_hash=None):
                    'quality': 1,
                }

-        jwt_response = self._download_json(
-            'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {}
-        if not jwt_response.get('jwt'):
+        jwt = jwt or traverse_obj(self._download_json(
+            'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str}))
+        if not jwt:
            return
-        headers = {'Authorization': 'jwt {}'.format(jwt_response['jwt']), 'Accept': 'application/json'}
-        original_response = self._download_json(
-            f'https://api.vimeo.com/videos/{video_id}', video_id,
-            headers=headers, fatal=False, expected_status=(403, 404)) or {}
-        for download_data in original_response.get('download') or []:
+        original_response = api_data or self._call_videos_api(
+            video_id, jwt, unlisted_hash, fatal=False, expected_status=(403, 404))
+        for download_data in traverse_obj(original_response, ('download', ..., {dict})):
            download_url = download_data.get('link')
            if not download_url or download_data.get('quality') != 'source':
                continue
@@ -354,7 +369,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
            'skip': 'No longer available',
        },
        {
-            'url': 'http://player.vimeo.com/video/54469442',
+            'url': 'https://player.vimeo.com/video/54469442',
            'md5': '619b811a4417aa4abe78dc653becf511',
            'note': 'Videos that embed the url in the player page',
            'info_dict': {
@@ -370,6 +385,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
            'params': {
                'format': 'best[protocol=https]',
            },
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            'url': 'http://vimeo.com/68375962',
@@ -379,22 +395,23 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'id': '68375962',
                'ext': 'mp4',
                'title': 'youtube-dl password protected test video',
-                'timestamp': 1371200155,
+                'timestamp': 1371214555,
                'upload_date': '20130614',
+                'release_timestamp': 1371214555,
+                'release_date': '20130614',
                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
                'uploader_id': 'user18948128',
                'uploader': 'Jaime Marquínez Ferrándiz',
                'duration': 10,
-                'description': 'md5:6173f270cd0c0119f22817204b3eb86c',
-                'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280',
-                'view_count': int,
                'comment_count': int,
                'like_count': int,
+                'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280',
            },
            'params': {
                'format': 'best[protocol=https]',
                'videopassword': 'youtube-dl',
            },
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            'url': 'http://vimeo.com/channels/keypeele/75629013',
@@ -418,29 +435,38 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'like_count': int,
            },
            'params': {'format': 'http-1080p'},
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            'url': 'http://vimeo.com/76979871',
            'note': 'Video with subtitles',
            'info_dict': {
                'id': '76979871',
-                'ext': 'mov',
+                'ext': 'mp4',
                'title': 'The New Vimeo Player (You Know, For Videos)',
-                'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
-                'timestamp': 1381846109,
+                'description': str,  # FIXME: Dynamic SEO spam description
+                'timestamp': 1381860509,
                'upload_date': '20131015',
+                'release_timestamp': 1381860509,
+                'release_date': '20131015',
                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff',
                'uploader_id': 'staff',
-                'uploader': 'Vimeo Staff',
+                'uploader': 'Vimeo',
                'duration': 62,
+                'comment_count': int,
+                'like_count': int,
+                'thumbnail': 'https://i.vimeocdn.com/video/452001751-8216e0571c251a09d7a8387550942d89f7f86f6398f8ed886e639b0dd50d3c90-d_1280',
                'subtitles': {
-                    'de': [{'ext': 'vtt'}],
-                    'en': [{'ext': 'vtt'}],
-                    'es': [{'ext': 'vtt'}],
-                    'fr': [{'ext': 'vtt'}],
+                    'de': 'count:3',
+                    'en': 'count:3',
+                    'es': 'count:3',
+                    'fr': 'count:3',
                },
            },
-            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
+            'expected_warnings': [
+                'Ignoring subtitle tracks found in the HLS manifest',
+                'Failed to parse XML: not well-formed',
+            ],
        },
        {
            # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/
@@ -456,11 +482,12 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'duration': 118,
                'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280',
            },
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
-            # contains original format
+            # contains Original format
            'url': 'https://vimeo.com/33951933',
-            'md5': '53c688fa95a55bf4b7293d37a89c5c53',
+            # 'md5': '53c688fa95a55bf4b7293d37a89c5c53',
            'info_dict': {
                'id': '33951933',
                'ext': 'mp4',
@@ -476,15 +503,19 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'view_count': int,
                'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280',
                'like_count': int,
+                'tags': 'count:11',
            },
+            # 'params': {'format': 'Original'},
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
-            'note': 'Contains original format not accessible in webpage',
+            'note': 'Contains source format not accessible in webpage',
            'url': 'https://vimeo.com/393756517',
-            'md5': 'c464af248b592190a5ffbb5d33f382b0',
+            # 'md5': 'c464af248b592190a5ffbb5d33f382b0',
            'info_dict': {
                'id': '393756517',
-                'ext': 'mov',
+                # 'ext': 'mov',
+                'ext': 'mp4',
                'timestamp': 1582642091,
                'uploader_id': 'frameworkla',
                'title': 'Straight To Hell - Sabrina: Netflix',
@@ -495,6 +526,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280',
                'uploader_url': 'https://vimeo.com/frameworkla',
            },
+            # 'params': {'format': 'source'},
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            # only available via https://vimeo.com/channels/tributes/6213729 and
@@ -511,16 +544,18 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'channel_id': 'tributes',
                'timestamp': 1250886430,
                'upload_date': '20090821',
-                'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
+                'description': str,  # FIXME: Dynamic SEO spam description
                'duration': 321,
                'comment_count': int,
                'view_count': int,
                'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280',
                'like_count': int,
+                'tags': ['[the shining', 'vimeohq', 'cv', 'vimeo tribute]'],
            },
            'params': {
                'skip_download': True,
            },
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            # redirects to ondemand extractor and should be passed through it
@@ -543,28 +578,23 @@ class VimeoIE(VimeoBaseInfoExtractor):
            'skip': 'this page is no longer available.',
        },
        {
-            'url': 'http://player.vimeo.com/video/68375962',
+            'url': 'https://player.vimeo.com/video/68375962',
            'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
            'info_dict': {
                'id': '68375962',
                'ext': 'mp4',
                'title': 'youtube-dl password protected test video',
-                'timestamp': 1371200155,
-                'upload_date': '20130614',
                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
                'uploader_id': 'user18948128',
                'uploader': 'Jaime Marquínez Ferrándiz',
                'duration': 10,
-                'description': 'md5:6173f270cd0c0119f22817204b3eb86c',
                'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280',
-                'view_count': int,
-                'comment_count': int,
-                'like_count': int,
            },
            'params': {
                'format': 'best[protocol=https]',
                'videopassword': 'youtube-dl',
            },
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741',
@@ -592,7 +622,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc",
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'user20132939',
-                'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b',
+                'description': str,  # FIXME: Dynamic SEO spam description
                'upload_date': '20150209',
                'timestamp': 1423518307,
                'thumbnail': 'https://i.vimeocdn.com/video/default_1280',
@@ -606,6 +636,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'format': 'best[protocol=https]',
                'videopassword': 'youtube-dl',
            },
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            # source file returns 403: Forbidden
@@ -633,11 +664,13 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'release_date': '20160329',
            },
            'params': {'skip_download': True},
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            'url': 'https://vimeo.com/138909882',
            'info_dict': {
                'id': '138909882',
+                # 'ext': 'm4v',
                'ext': 'mp4',
                'title': 'Eastnor Castle 2015 Firework Champions - The Promo!',
                'description': 'md5:5967e090768a831488f6e74b7821b3c1',
@@ -645,11 +678,19 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'uploader': 'Firework Champions',
                'upload_date': '20150910',
                'timestamp': 1441901895,
+                'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d_1280',
+                'uploader_url': 'https://vimeo.com/fireworkchampions',
+                'tags': 'count:6',
+                'duration': 229,
+                'view_count': int,
+                'like_count': int,
+                'comment_count': int,
            },
            'params': {
                'skip_download': True,
-                'format': 'Original',
+                # 'format': 'source',
            },
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            'url': 'https://vimeo.com/channels/staffpicks/143603739',
@@ -670,8 +711,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
                'like_count': int,
                'uploader_url': 'https://vimeo.com/karimhd',
                'channel_url': 'https://vimeo.com/channels/staffpicks',
+                'tags': 'count:6',
            },
            'params': {'skip_download': 'm3u8'},
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            # requires passing unlisted_hash(a52724358e) to load_download_config request
@@ -701,6 +744,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
            'params': {
                'skip_download': True,
            },
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
        {
            # chapters must be sorted, see: https://github.com/yt-dlp/yt-dlp/issues/5308
@@ -735,6 +779,48 @@ class VimeoIE(VimeoBaseInfoExtractor):
            },
            'expected_warnings': ['Failed to parse XML: not well-formed'],
        },
+        {
+            # vimeo.com URL with unlisted hash and Original format
+            'url': 'https://vimeo.com/144579403/ec02229140',
+            # 'md5': '6b662c2884e0373183fbde2a0d15cb78',
+            'info_dict': {
+                'id': '144579403',
+                'ext': 'mp4',
+                'title': 'SALESMANSHIP',
+                'description': 'md5:4338302f347a1ff8841b4a3aecaa09f0',
+                'uploader': 'Off the Picture Pictures',
+                'uploader_id': 'offthepicturepictures',
+                'uploader_url': 'https://vimeo.com/offthepicturepictures',
+                'duration': 669,
+                'upload_date': '20151104',
+                'timestamp': 1446607180,
+                'release_date': '20151104',
+                'release_timestamp': 1446607180,
+                'like_count': int,
+                'view_count': int,
+                'comment_count': int,
+                'thumbnail': r're:https://i\.vimeocdn\.com/video/1018638656-[\da-f]+-d_1280',
+            },
+            # 'params': {'format': 'Original'},
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
+        },
+        {
+            # player.vimeo.com URL with source format
+            'url': 'https://player.vimeo.com/video/859028877',
+            # 'md5': '19ca3d2463441dee2d2f0671ac2916a2',
+            'info_dict': {
+                'id': '859028877',
+                'ext': 'mp4',
+                'title': 'Ariana Grande - Honeymoon Avenue (Live from London)',
+                'uploader': 'Raja Virdi',
+                'uploader_id': 'rajavirdi',
+                'uploader_url': 'https://vimeo.com/rajavirdi',
+                'duration': 309,
+                'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d_1280',
+            },
+            # 'params': {'format': 'source'},
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
+        },
        {
            # user playlist alias -> https://vimeo.com/258705797
            'url': 'https://vimeo.com/user26785108/newspiritualguide',
@@ -768,16 +854,6 @@ def _verify_player_video_password(self, url, video_id, headers):
            raise ExtractorError('Wrong video password', expected=True)
        return checked

-    def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None):
-        return self._download_json(
-            join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'),
-            video_id, 'Downloading API JSON', headers={
-                'Authorization': f'jwt {jwt_token}',
-                'Accept': 'application/json',
-            }, query={
-                'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
-            })
-
    def _extract_from_api(self, video_id, unlisted_hash=None):
        viewer = self._download_json(
            'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info')
@@ -798,6 +874,11 @@ def _extract_from_api(self, video_id, unlisted_hash=None):

        info = self._parse_config(self._download_json(
            video['config_url'], video_id), video_id)
+        source_format = self._extract_original_format(
+            f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video)
+        if source_format:
+            info['formats'].append(source_format)
+
        get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
        info.update({
            'description': video.get('description'),
@@ -899,7 +980,12 @@ def _real_extract(self, url):
            if config.get('view') == 4:
                config = self._verify_player_video_password(
                    redirect_url, video_id, headers)
-            return self._parse_config(config, video_id)
+            info = self._parse_config(config, video_id)
+            source_format = self._extract_original_format(
+                f'https://vimeo.com/{video_id}', video_id, unlisted_hash)
+            if source_format:
+                info['formats'].append(source_format)
+            return info

        vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None)
        if vimeo_config:
@@ -1269,6 +1355,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
    IE_DESC = 'Review pages on vimeo'
    _VALID_URL = r'https?://vimeo\.com/(?P<user>[^/?#]+)/review/(?P<id>\d+)/(?P<hash>[\da-f]{10})'
    _TESTS = [{
+        'url': 'https://vimeo.com/user170863801/review/996447483/a316d6ed8d',
+        'info_dict': {
+            'id': '996447483',
+            'ext': 'mp4',
+            'title': 'Rodeo day 1-_2',
+            'uploader': 'BROADKAST',
+            'uploader_id': 'user170863801',
+            'uploader_url': 'https://vimeo.com/user170863801',
+            'duration': 30,
+            'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280',
+        },
+        'params': {'skip_download': 'm3u8'},
+        'expected_warnings': ['Failed to parse XML'],
+    }, {
        'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d',
        'md5': 'c507a72f780cacc12b2248bb4006d253',
        'info_dict': {
@@ -1282,6 +1382,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
            'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280',
            'uploader_url': 'https://vimeo.com/user21297594',
        },
+        'skip': '404 Not Found',
    }, {
        'note': 'video player needs Referer',
        'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053',
@@ -1316,6 +1417,7 @@ def _real_extract(self, url):
        user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash')
        data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}'
        data = self._download_json(data_url, video_id)
+        viewer = {}
        if data.get('isLocked') is True:
            video_password = self._get_video_password()
            viewer = self._download_json(
@@ -1327,8 +1429,8 @@ def _real_extract(self, url):
        config = self._download_json(config_url, video_id)
        info_dict = self._parse_config(config, video_id)
        source_format = self._extract_original_format(
-            f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', video_id,
-            unlisted_hash=traverse_obj(config_url, ({parse_qs}, 'h', -1)))
+            f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action',
+            video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt'))
        if source_format:
            info_dict['formats'].append(source_format)
        info_dict['description'] = clean_html(clip_data.get('description'))