mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-10 01:02:13 +01:00
[extractor] Add dev option --load-pages
This commit is contained in:
parent
617f658b7e
commit
f95b9dee45
4 changed files with 54 additions and 36 deletions
|
@ -2022,6 +2022,7 @@ #### Developer options
|
||||||
These options are not intended to be used by the end-user
|
These options are not intended to be used by the end-user
|
||||||
|
|
||||||
--test Download only part of video for testing extractors
|
--test Download only part of video for testing extractors
|
||||||
|
--load-pages Load pages dumped by --write-pages
|
||||||
--youtube-print-sig-code For testing youtube signatures
|
--youtube-print-sig-code For testing youtube signatures
|
||||||
--allow-unplayable-formats List unplayable formats also
|
--allow-unplayable-formats List unplayable formats also
|
||||||
--no-allow-unplayable-formats Default
|
--no-allow-unplayable-formats Default
|
||||||
|
|
|
@ -758,6 +758,7 @@ def parse_options(argv=None):
|
||||||
'verbose': opts.verbose,
|
'verbose': opts.verbose,
|
||||||
'dump_intermediate_pages': opts.dump_intermediate_pages,
|
'dump_intermediate_pages': opts.dump_intermediate_pages,
|
||||||
'write_pages': opts.write_pages,
|
'write_pages': opts.write_pages,
|
||||||
|
'load_pages': opts.load_pages,
|
||||||
'test': opts.test,
|
'test': opts.test,
|
||||||
'keepvideo': opts.keepvideo,
|
'keepvideo': opts.keepvideo,
|
||||||
'min_filesize': opts.min_filesize,
|
'min_filesize': opts.min_filesize,
|
||||||
|
|
|
@ -75,7 +75,6 @@
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
unified_timestamp,
|
unified_timestamp,
|
||||||
update_Request,
|
update_Request,
|
||||||
update_url_query,
|
|
||||||
url_basename,
|
url_basename,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
urljoin,
|
urljoin,
|
||||||
|
@ -724,6 +723,11 @@ def __can_accept_status_code(err, expected_status):
|
||||||
else:
|
else:
|
||||||
return err.code in variadic(expected_status)
|
return err.code in variadic(expected_status)
|
||||||
|
|
||||||
|
def _create_request(self, url_or_request, data=None, headers={}, query={}):
    """
    Build the request object for a URL or an already constructed request.

    A value that is not a compat_urllib_request.Request is first wrapped
    with sanitized_Request. The request is then handed to update_Request
    together with data, headers and query, and the value update_Request
    returns is returned unchanged.
    """
    request = (
        url_or_request
        if isinstance(url_or_request, compat_urllib_request.Request)
        else sanitized_Request(url_or_request))
    return update_Request(request, data=data, headers=headers, query=query)
|
||||||
|
|
||||||
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
|
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
|
||||||
"""
|
"""
|
||||||
Return the response handle.
|
Return the response handle.
|
||||||
|
@ -755,16 +759,8 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa
|
||||||
if 'X-Forwarded-For' not in headers:
|
if 'X-Forwarded-For' not in headers:
|
||||||
headers['X-Forwarded-For'] = self._x_forwarded_for_ip
|
headers['X-Forwarded-For'] = self._x_forwarded_for_ip
|
||||||
|
|
||||||
if isinstance(url_or_request, compat_urllib_request.Request):
|
|
||||||
url_or_request = update_Request(
|
|
||||||
url_or_request, data=data, headers=headers, query=query)
|
|
||||||
else:
|
|
||||||
if query:
|
|
||||||
url_or_request = update_url_query(url_or_request, query)
|
|
||||||
if data is not None or headers:
|
|
||||||
url_or_request = sanitized_Request(url_or_request, data, headers)
|
|
||||||
try:
|
try:
|
||||||
return self._downloader.urlopen(url_or_request)
|
return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
|
||||||
except network_exceptions as err:
|
except network_exceptions as err:
|
||||||
if isinstance(err, compat_urllib_error.HTTPError):
|
if isinstance(err, compat_urllib_error.HTTPError):
|
||||||
if self.__can_accept_status_code(err, expected_status):
|
if self.__can_accept_status_code(err, expected_status):
|
||||||
|
@ -876,40 +872,44 @@ def __check_blocked(self, content):
|
||||||
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
|
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
|
||||||
expected=True)
|
expected=True)
|
||||||
|
|
||||||
|
def _request_dump_filename(self, url, video_id):
    """
    Return the dump file name for a request.

    The name is derived from '<video_id>_<url>'. When that string is longer
    than the 'trim_file_name' param (240 if the param is unset or falsy),
    it is cut and terminated with '___' plus the MD5 hex digest of the
    untrimmed string. '.dump' is appended and the result is passed through
    sanitize_filename in restricted mode.

    @param url       URL the dump belongs to
    @param video_id  ID that prefixes the file name
    """
    stem = f'{video_id}_{url}'
    limit = self.get_param('trim_file_name') or 240
    if len(stem) > limit:
        digest_suffix = '___' + hashlib.md5(stem.encode('utf-8')).hexdigest()
        stem = stem[:limit - len(digest_suffix)] + digest_suffix
    dump_name = sanitize_filename(f'{stem}.dump', restricted=True)
    if compat_os_name != 'nt':
        return dump_name
    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    absolute = os.path.abspath(dump_name)
    if len(absolute) > 259:
        return fR'\\?\{absolute}'
    return dump_name
|
||||||
|
|
||||||
|
def __decode_webpage(self, webpage_bytes, encoding, headers):
    """
    Decode raw page bytes into text.

    When no encoding is given, one is obtained from
    _guess_encoding_from_content using the 'Content-Type' header (empty
    string if absent) and the bytes themselves. Undecodable bytes are
    replaced rather than raising; if the encoding name is unknown to
    Python (LookupError), the bytes are decoded as UTF-8 instead.
    """
    charset = encoding or self._guess_encoding_from_content(
        headers.get('Content-Type', ''), webpage_bytes)
    try:
        text = webpage_bytes.decode(charset, 'replace')
    except LookupError:
        text = webpage_bytes.decode('utf-8', 'replace')
    return text
|
||||||
|
|
||||||
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """
    Read the body of a response handle and return it as decoded text.

    @param urlh            response handle; read(), geturl() and headers are used
    @param url_or_request  not used in this body
    @param video_id        ID used to name the dump file for 'write_pages'
    @param note, errnote, fatal  not used in this body
    @param prefix          bytes prepended to the body when not None
    @param encoding        encoding used for decoding; guessed when falsy

    With the 'dump_intermediate_pages' param set, the base64 of the raw
    bytes is printed. With the 'write_pages' param set, the raw bytes are
    written to the file named by _request_dump_filename. The decoded text
    is passed to the blocked-page check before being returned.
    """
    webpage_bytes = urlh.read()
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self.get_param('write_pages'):
        # _request_dump_filename takes (url, video_id). The arguments must be
        # passed in that order so that the name written here is the same one
        # the 'load_pages' code path computes when reading the dump back.
        filename = self._request_dump_filename(urlh.geturl(), video_id)
        self.to_screen(f'Saving request to {filename}')
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)

    self.__check_blocked(content)

    return content
|
||||||
|
@ -967,9 +967,21 @@ def download_handle(self, url_or_request, video_id, note=note, errnote=errnote,
|
||||||
content, urlh = res
|
content, urlh = res
|
||||||
return parse(self, content, video_id, transform_source, fatal), urlh
|
return parse(self, content, video_id, transform_source, fatal), urlh
|
||||||
|
|
||||||
def download_content(
|
def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
|
||||||
self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, *args, **kwargs):
|
fatal=True, encoding=None, data=None, headers={}, query={}, *args, **kwargs):
|
||||||
args = [url_or_request, video_id, note, errnote, transform_source, *args]
|
if self.get_param('load_pages'):
|
||||||
|
url_or_request = self._create_request(url_or_request, data, headers, query)
|
||||||
|
filename = self._request_dump_filename(url_or_request.full_url, video_id)
|
||||||
|
self.to_screen(f'Loading request from {filename}')
|
||||||
|
try:
|
||||||
|
with open(filename, 'rb') as dumpf:
|
||||||
|
webpage_bytes = dumpf.read()
|
||||||
|
except OSError as e:
|
||||||
|
self.report_warning(f'Unable to load request from disk: {e}')
|
||||||
|
else:
|
||||||
|
content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
|
||||||
|
return parse(self, content, video_id, transform_source, fatal)
|
||||||
|
args = [url_or_request, video_id, note, errnote, transform_source, fatal, encoding, data, headers, query, *args]
|
||||||
if parser is None:
|
if parser is None:
|
||||||
args.pop(4) # transform_source
|
args.pop(4) # transform_source
|
||||||
# The method is fetched by name so subclasses can override _download_..._handle
|
# The method is fetched by name so subclasses can override _download_..._handle
|
||||||
|
|
|
@ -1154,6 +1154,10 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
|
||||||
'--write-pages',
|
'--write-pages',
|
||||||
action='store_true', dest='write_pages', default=False,
|
action='store_true', dest='write_pages', default=False,
|
||||||
help='Write downloaded intermediary pages to files in the current directory to debug problems')
|
help='Write downloaded intermediary pages to files in the current directory to debug problems')
|
||||||
|
verbosity.add_option(
|
||||||
|
'--load-pages',
|
||||||
|
action='store_true', dest='load_pages', default=False,
|
||||||
|
help=optparse.SUPPRESS_HELP)
|
||||||
verbosity.add_option(
|
verbosity.add_option(
|
||||||
'--youtube-print-sig-code',
|
'--youtube-print-sig-code',
|
||||||
action='store_true', dest='youtube_print_sig_code', default=False,
|
action='store_true', dest='youtube_print_sig_code', default=False,
|
||||||
|
|
Loading…
Reference in a new issue