From b7c47b743871cdf3e0de75b17e4454d987384bf9 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Fri, 3 Jun 2022 21:02:31 +0530 Subject: [PATCH] [extractor] Add `_search_json` All fetching of JSON objects should eventually be done with this function but only `youtube` is being refactored for now --- yt_dlp/extractor/archiveorg.py | 21 +++++++++------------ yt_dlp/extractor/common.py | 24 ++++++++++++------------ yt_dlp/extractor/youtube.py | 23 ++++++++--------------- yt_dlp/utils.py | 13 +++++++++++++ 4 files changed, 42 insertions(+), 39 deletions(-) diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index c85d5297d..c1c9b0adf 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -442,9 +442,10 @@ class YoutubeWebArchiveIE(InfoExtractor): 'only_matching': True }, ] - _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE - _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE - _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|{{.+}})\s*{end_pattern}', + string, name, group='json', fatal=fatal) or '{}', + video_id, fatal=fatal, ignore_extra=True, **kwargs) or {} + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 8b2332dc1..c8541c664 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -397,9 +397,8 @@ def _check_login_required(self): if self._LOGIN_REQUIRED and not self._cookies_passed: self.raise_login_required('Login details are needed to download this content', method='cookies') - _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+})\s*;' - _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+})\s*;' - _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|