From 5bfa48620542d9ee34958d7c96aa45465b058fbd Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 26 Jan 2021 15:50:20 +0530 Subject: [PATCH] Add option `--parse-metadata` * The fields extracted by this can be used in `--output` * Deprecated `--metadata-from-title` :ci skip dl --- README.md | 25 +++--- test/test_postprocessors.py | 10 ++- youtube_dlc/YoutubeDL.py | 89 ++++++++++--------- youtube_dlc/__init__.py | 18 +++- youtube_dlc/options.py | 16 ++-- youtube_dlc/postprocessor/__init__.py | 4 +- .../postprocessor/metadatafromfield.py | 66 ++++++++++++++ .../postprocessor/metadatafromtitle.py | 44 --------- 8 files changed, 162 insertions(+), 110 deletions(-) create mode 100644 youtube_dlc/postprocessor/metadatafromfield.py delete mode 100644 youtube_dlc/postprocessor/metadatafromtitle.py diff --git a/README.md b/README.md index 7524e8493..886ec245f 100644 --- a/README.md +++ b/README.md @@ -610,16 +610,19 @@ ## Post-Processing Options: --no-embed-thumbnail Do not embed thumbnail (default) --add-metadata Write metadata to the video file --no-add-metadata Do not write metadata (default) - --metadata-from-title FORMAT Parse additional metadata like song title / - artist from the video title. The format - syntax is the same as --output. Regular - expression with named capture groups may - also be used. The parsed parameters replace - existing values. Example: --metadata-from- - title "%(artist)s - %(title)s" matches a + --parse-metadata FIELD:FORMAT Parse additional metadata like title/artist + from other fields. Give field name to + extract data from, and format of the field + separated by a ":". The format syntax is + the same as --output. Regular expression + with named capture groups may also be used. + The parsed parameters replace existing + values. This option can be used multiple + times. Example: --parse-metadata + "title:%(artist)s - %(title)s" matches a title like "Coldplay - Paradise". Example - (regex): --metadata-from-title - "(?P.+?) 
- (?P.+)" + (regex): --parse-metadata + "description:Artist - (?P<artist>.+?)" --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) --fixup POLICY Automatically correct known faults of the @@ -1098,7 +1101,7 @@ # PLUGINS Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example. -**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code ((`<root dir>/youtube_dlc/__main__.py`) +**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code (`<root-dir>/youtube_dlc/__main__.py`) # MORE -For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl) +For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl#faq) diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index 6f538a3da..fabe7e6fb 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -8,10 +8,16 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dlc.postprocessor import MetadataFromTitlePP +from youtube_dlc.postprocessor import MetadataFromFieldPP, MetadataFromTitlePP + + +class TestMetadataFromField(unittest.TestCase): + def test_format_to_regex(self): + pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s']) + self.assertEqual(pp._data[0]['regex'], r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)') class TestMetadataFromTitle(unittest.TestCase): def test_format_to_regex(self): pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') - self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ 
(?P<artist>.+)') + self.assertEqual(pp._titleregex, r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)') diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index ce990507c..0e93303b1 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -375,8 +375,7 @@ class YoutubeDL(object): params = None _ies = [] - _pps = [] - _pps_end = [] + _pps = {'beforedl': [], 'aftermove': [], 'normal': []} __prepare_filename_warned = False _download_retcode = None _num_downloads = None @@ -390,8 +389,7 @@ def __init__(self, params=None, auto_init=True): params = {} self._ies = [] self._ies_instances = {} - self._pps = [] - self._pps_end = [] + self._pps = {'beforedl': [], 'aftermove': [], 'normal': []} self.__prepare_filename_warned = False self._post_hooks = [] self._progress_hooks = [] @@ -494,11 +492,13 @@ def check_deprecated(param, option, suggestion): pp_class = get_postprocessor(pp_def_raw['key']) pp_def = dict(pp_def_raw) del pp_def['key'] - after_move = pp_def.get('_after_move', False) - if '_after_move' in pp_def: - del pp_def['_after_move'] + if 'when' in pp_def: + when = pp_def['when'] + del pp_def['when'] + else: + when = 'normal' pp = pp_class(self, **compat_kwargs(pp_def)) - self.add_post_processor(pp, after_move=after_move) + self.add_post_processor(pp, when=when) for ph in self.params.get('post_hooks', []): self.add_post_hook(ph) @@ -550,12 +550,9 @@ def add_default_info_extractors(self): for ie in gen_extractor_classes(): self.add_info_extractor(ie) - def add_post_processor(self, pp, after_move=False): + def add_post_processor(self, pp, when='normal'): """Add a PostProcessor object to the end of the chain.""" - if after_move: - self._pps_end.append(pp) - else: - self._pps.append(pp) + self._pps[when].append(pp) pp.set_downloader(self) def add_post_hook(self, ph): @@ -1948,6 +1945,8 @@ def process_info(self, info_dict): self._num_downloads += 1 + info_dict = self.pre_process(info_dict) + filename = self.prepare_filename(info_dict, warn=True) 
info_dict['_filename'] = full_filename = self.prepare_filepath(filename) temp_filename = self.prepare_filepath(filename, 'temp') @@ -2400,41 +2399,45 @@ def filter_requested_info(info_dict): (k, v) for k, v in info_dict.items() if k not in ['requested_formats', 'requested_subtitles']) + def run_pp(self, pp, infodict, files_to_move={}): + files_to_delete = [] + try: + files_to_delete, infodict = pp.run(infodict) + except PostProcessingError as e: + self.report_error(e.msg) + if not files_to_delete: + return files_to_move, infodict + + if self.params.get('keepvideo', False): + for f in files_to_delete: + files_to_move.setdefault(f, '') + else: + for old_filename in set(files_to_delete): + self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) + try: + os.remove(encodeFilename(old_filename)) + except (IOError, OSError): + self.report_warning('Unable to remove downloaded original file') + if old_filename in files_to_move: + del files_to_move[old_filename] + return files_to_move, infodict + + def pre_process(self, ie_info): + info = dict(ie_info) + for pp in self._pps['beforedl']: + info = self.run_pp(pp, info)[1] + return info + def post_process(self, filename, ie_info, files_to_move={}): """Run all the postprocessors on the given file.""" info = dict(ie_info) info['filepath'] = filename - def run_pp(pp): - files_to_delete = [] - infodict = info - try: - files_to_delete, infodict = pp.run(infodict) - except PostProcessingError as e: - self.report_error(e.msg) - if not files_to_delete: - return infodict - - if self.params.get('keepvideo', False): - for f in files_to_delete: - files_to_move.setdefault(f, '') - else: - for old_filename in set(files_to_delete): - self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) - try: - os.remove(encodeFilename(old_filename)) - except (IOError, OSError): - self.report_warning('Unable to remove downloaded original file') - if old_filename in files_to_move: - del files_to_move[old_filename] - 
return infodict - - for pp in ie_info.get('__postprocessors', []) + self._pps: - info = run_pp(pp) - info = run_pp(MoveFilesAfterDownloadPP(self, files_to_move)) - files_to_move = {} - for pp in self._pps_end: - info = run_pp(pp) + for pp in ie_info.get('__postprocessors', []) + self._pps['normal']: + files_to_move, info = self.run_pp(pp, info, files_to_move) + info = self.run_pp(MoveFilesAfterDownloadPP(self, files_to_move), info, files_to_move)[1] + for pp in self._pps['aftermove']: + files_to_move, info = self.run_pp(pp, info, {}) def _make_archive_id(self, info_dict): video_id = info_dict.get('id') diff --git a/youtube_dlc/__init__.py b/youtube_dlc/__init__.py index e2db66266..5f97b51ff 100644 --- a/youtube_dlc/__init__.py +++ b/youtube_dlc/__init__.py @@ -45,6 +45,7 @@ from .extractor import gen_extractors, list_extractors from .extractor.common import InfoExtractor from .extractor.adobepass import MSO_INFO +from .postprocessor.metadatafromfield import MetadataFromFieldPP from .YoutubeDL import YoutubeDL @@ -249,16 +250,25 @@ def parse_retries(retries): if re.match(InfoExtractor.FormatSort.regex, f) is None: parser.error('invalid format sort string "%s" specified' % f) + if opts.metafromfield is None: + opts.metafromfield = [] + if opts.metafromtitle is not None: + opts.metafromfield.append('title:%s' % opts.metafromtitle) + for f in opts.metafromfield: + if re.match(MetadataFromFieldPP.regex, f) is None: + parser.error('invalid format string "%s" specified for --parse-metadata' % f) + any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json any_printing = opts.print_json download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive # PostProcessors postprocessors = [] - if opts.metafromtitle: + if opts.metafromfield: postprocessors.append({ - 'key': 
'MetadataFromTitle', - 'titleformat': opts.metafromtitle + 'key': 'MetadataFromField', + 'formats': opts.metafromfield, + 'when': 'beforedl' }) if opts.extractaudio: postprocessors.append({ @@ -324,7 +334,7 @@ def parse_retries(retries): postprocessors.append({ 'key': 'ExecAfterDownload', 'exec_cmd': opts.exec_cmd, - '_after_move': True + 'when': 'aftermove' }) _args_compat_warning = 'WARNING: %s given without specifying name. The arguments will be given to all %s\n' diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py index 4910c2083..859f28e2b 100644 --- a/youtube_dlc/options.py +++ b/youtube_dlc/options.py @@ -1078,14 +1078,20 @@ def _dict_from_multiple_values_options_callback( postproc.add_option( '--metadata-from-title', metavar='FORMAT', dest='metafromtitle', + help=optparse.SUPPRESS_HELP) + postproc.add_option( + '--parse-metadata', + metavar='FIELD:FORMAT', dest='metafromfield', action='append', help=( - 'Parse additional metadata like song title / artist from the video title. ' - 'The format syntax is the same as --output. Regular expression with ' - 'named capture groups may also be used. ' + 'Parse additional metadata like title/artist from other fields. ' + 'Give field name to extract data from, and format of the field seperated by a ":". ' + 'The format syntax is the same as --output. ' + 'Regular expression with named capture groups may also be used. ' 'The parsed parameters replace existing values. ' - 'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' + 'This option can be used multiple times. ' + 'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like ' '"Coldplay - Paradise". ' - 'Example (regex): --metadata-from-title "(?P<artist>.+?) 
- (?P<title>.+)"')) + 'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"')) postproc.add_option( '--xattrs', action='store_true', dest='xattrs', default=False, diff --git a/youtube_dlc/postprocessor/__init__.py b/youtube_dlc/postprocessor/__init__.py index 840a83b0e..c5aa925c6 100644 --- a/youtube_dlc/postprocessor/__init__.py +++ b/youtube_dlc/postprocessor/__init__.py @@ -16,7 +16,8 @@ ) from .xattrpp import XAttrMetadataPP from .execafterdownload import ExecAfterDownloadPP -from .metadatafromtitle import MetadataFromTitlePP +from .metadatafromfield import MetadataFromFieldPP +from .metadatafromfield import MetadataFromTitlePP from .movefilesafterdownload import MoveFilesAfterDownloadPP from .sponskrub import SponSkrubPP @@ -39,6 +40,7 @@ def get_postprocessor(key): 'FFmpegSubtitlesConvertorPP', 'FFmpegVideoConvertorPP', 'FFmpegVideoRemuxerPP', + 'MetadataFromFieldPP', 'MetadataFromTitlePP', 'MoveFilesAfterDownloadPP', 'SponSkrubPP', diff --git a/youtube_dlc/postprocessor/metadatafromfield.py b/youtube_dlc/postprocessor/metadatafromfield.py new file mode 100644 index 000000000..eb774326b --- /dev/null +++ b/youtube_dlc/postprocessor/metadatafromfield.py @@ -0,0 +1,66 @@ +from __future__ import unicode_literals + +import re + +from .common import PostProcessor +from ..compat import compat_str + + +class MetadataFromFieldPP(PostProcessor): + regex = r'(?P<field>\w+):(?P<format>.+)$' + + def __init__(self, downloader, formats): + PostProcessor.__init__(self, downloader) + assert isinstance(formats, (list, tuple)) + self._data = [] + for f in formats: + assert isinstance(f, compat_str) + match = re.match(self.regex, f) + assert match is not None + self._data.append({ + 'field': match.group('field'), + 'format': match.group('format'), + 'regex': self.format_to_regex(match.group('format'))}) + + def format_to_regex(self, fmt): + r""" + Converts a string like + '%(title)s - %(artist)s' + to a regex like + '(?P<title>.+)\ \-\ (?P<artist>.+)' + """ 
+ if not re.search(r'%\(\w+\)s', fmt): + return fmt + lastpos = 0 + regex = '' + # replace %(..)s with regex group and escape other string parts + for match in re.finditer(r'%\((\w+)\)s', fmt): + regex += re.escape(fmt[lastpos:match.start()]) + regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)' + lastpos = match.end() + if lastpos < len(fmt): + regex += re.escape(fmt[lastpos:]) + return regex + + def run(self, info): + for dictn in self._data: + field, regex = dictn['field'], dictn['regex'] + if field not in info: + self.report_warning('Video doesnot have a %s' % field) + continue + self.write_debug('Searching for r"%s" in %s' % (regex, field)) + match = re.search(regex, info[field]) + if match is None: + self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format'])) + continue + for attribute, value in match.groupdict().items(): + info[attribute] = value + self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA')) + return [], info + + +class MetadataFromTitlePP(MetadataFromFieldPP): # for backward compatibility + def __init__(self, downloader, titleformat): + super(MetadataFromTitlePP, self).__init__(downloader, ['title:%s' % titleformat]) + self._titleformat = titleformat + self._titleregex = self._data[0]['regex'] diff --git a/youtube_dlc/postprocessor/metadatafromtitle.py b/youtube_dlc/postprocessor/metadatafromtitle.py deleted file mode 100644 index 86df3b4f0..000000000 --- a/youtube_dlc/postprocessor/metadatafromtitle.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import PostProcessor - - -class MetadataFromTitlePP(PostProcessor): - def __init__(self, downloader, titleformat): - super(MetadataFromTitlePP, self).__init__(downloader) - self._titleformat = titleformat - self._titleregex = (self.format_to_regex(titleformat) - if re.search(r'%\(\w+\)s', titleformat) - else titleformat) - - def format_to_regex(self, fmt): - r""" - Converts a 
string like - '%(title)s - %(artist)s' - to a regex like - '(?P<title>.+)\ \-\ (?P<artist>.+)' - """ - lastpos = 0 - regex = '' - # replace %(..)s with regex group and escape other string parts - for match in re.finditer(r'%\((\w+)\)s', fmt): - regex += re.escape(fmt[lastpos:match.start()]) - regex += r'(?P<' + match.group(1) + '>.+)' - lastpos = match.end() - if lastpos < len(fmt): - regex += re.escape(fmt[lastpos:]) - return regex - - def run(self, info): - title = info['title'] - match = re.match(self._titleregex, title) - if match is None: - self.to_screen('Could not interpret title of video as "%s"' % self._titleformat) - return [], info - for attribute, value in match.groupdict().items(): - info[attribute] = value - self.to_screen('parsed %s: %s' % (attribute, value if value is not None else 'NA')) - - return [], info