mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-10 01:02:13 +01:00
[extractor/facebook] Fix metadata extraction (#6856)
Closes #3432. Authored by: ringus1
This commit is contained in:
parent
c449c0655d
commit
3b52a60688
1 changed file with 9 additions and 5 deletions
|
@@ -390,7 +390,10 @@ def extract_metadata(webpage):
|
||||||
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
|
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
|
||||||
title = get_first(media, ('title', 'text'))
|
title = get_first(media, ('title', 'text'))
|
||||||
description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
|
description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
|
||||||
uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {}
|
uploader_data = (
|
||||||
|
get_first(media, ('owner', {dict}))
|
||||||
|
or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
|
||||||
|
or get_first(post, ('node', 'actors', ..., {dict})) or {})
|
||||||
|
|
||||||
page_title = title or self._html_search_regex((
|
page_title = title or self._html_search_regex((
|
||||||
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
|
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
|
||||||
|
@@ -415,16 +418,17 @@ def extract_metadata(webpage):
|
||||||
# in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
|
# in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
|
||||||
if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
|
if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
|
||||||
thumbnail = None
|
thumbnail = None
|
||||||
view_count = parse_count(self._search_regex(
|
|
||||||
r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
|
|
||||||
default=None))
|
|
||||||
info_dict = {
|
info_dict = {
|
||||||
'description': description,
|
'description': description,
|
||||||
'uploader': uploader,
|
'uploader': uploader,
|
||||||
'uploader_id': uploader_data.get('id'),
|
'uploader_id': uploader_data.get('id'),
|
||||||
'timestamp': timestamp,
|
'timestamp': timestamp,
|
||||||
'thumbnail': thumbnail,
|
'thumbnail': thumbnail,
|
||||||
'view_count': view_count,
|
'view_count': parse_count(self._search_regex(
|
||||||
|
(r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',),
|
||||||
|
webpage, 'view count', default=None)),
|
||||||
|
'concurrent_view_count': get_first(post, (
|
||||||
|
('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
|
||||||
}
|
}
|
||||||
|
|
||||||
info_json_ld = self._search_json_ld(webpage, video_id, default={})
|
info_json_ld = self._search_json_ld(webpage, video_id, default={})
|
||||||
|
|
Loading…
Reference in a new issue