mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-10 01:02:13 +01:00
Add option --replace-in-metadata
This commit is contained in:
parent
a38bd1defa
commit
e9f4ccd19e
8 changed files with 186 additions and 106 deletions
13
README.md
13
README.md
|
@ -777,6 +777,10 @@ ## Post-Processing Options:
|
|||
--parse-metadata FROM:TO Parse additional metadata like title/artist
|
||||
from other fields; see "MODIFYING METADATA"
|
||||
for details
|
||||
--replace-in-metadata FIELDS REGEX REPLACE
|
||||
Replace text in a metadata field using the
|
||||
given regex. This option can be used
|
||||
multiple times
|
||||
--xattrs Write metadata to the video file's xattrs
|
||||
(using dublin core and xdg standards)
|
||||
--fixup POLICY Automatically correct known faults of the
|
||||
|
@ -1333,7 +1337,11 @@ # preferring better codec and then larger total bitrate for the same resolution
|
|||
|
||||
# MODIFYING METADATA
|
||||
|
||||
The metadata obtained by the extractors can be modified by using `--parse-metadata FROM:TO`. The general syntax is to give the name of a field or a template (with similar syntax to [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields.
|
||||
The metadata obtained by the extractors can be modified by using `--parse-metadata` and `--replace-in-metadata`.
|
||||
|
||||
`--replace-in-metadata FIELDS REGEX REPLACE` is used to replace text in any metadata field using a [Python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax). [Backreferences](https://docs.python.org/3/library/re.html?highlight=backreferences#re.sub) can be used in the replace string for advanced use.
|
||||
|
||||
The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or a template (with same syntax as [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields.
|
||||
|
||||
Note that any field created by this can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--add-metadata`.
|
||||
|
||||
|
@ -1380,6 +1388,9 @@ # Set title as "Series name S01E05"
|
|||
# Set "comment" field in video metadata using description instead of webpage_url
|
||||
$ yt-dlp --parse-metadata 'description:(?s)(?P<meta_comment>.+)' --add-metadata
|
||||
|
||||
# Replace all spaces and "_" in title and uploader with a `-`
|
||||
$ yt-dlp --replace-in-metadata 'title,uploader' '[ _]' '-'
|
||||
|
||||
```
|
||||
|
||||
# EXTRACTOR ARGUMENTS
|
||||
|
|
|
@ -14,29 +14,28 @@
|
|||
ExecAfterDownloadPP,
|
||||
FFmpegThumbnailsConvertorPP,
|
||||
MetadataFromFieldPP,
|
||||
MetadataFromTitlePP,
|
||||
MetadataParserPP,
|
||||
)
|
||||
|
||||
|
||||
class TestMetadataFromField(unittest.TestCase):
|
||||
|
||||
def test_format_to_regex(self):
|
||||
pp = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
|
||||
self.assertEqual(pp._data[0]['regex'], r'(?P<title>.+)\ \-\ (?P<artist>.+)')
|
||||
self.assertEqual(
|
||||
MetadataParserPP.format_to_regex('%(title)s - %(artist)s'),
|
||||
r'(?P<title>.+)\ \-\ (?P<artist>.+)')
|
||||
self.assertEqual(MetadataParserPP.format_to_regex(r'(?P<x>.+)'), r'(?P<x>.+)')
|
||||
|
||||
def test_field_to_outtmpl(self):
|
||||
pp = MetadataFromFieldPP(None, ['title:%(title)s : %(artist)s'])
|
||||
self.assertEqual(pp._data[0]['tmpl'], '%(title)s')
|
||||
def test_field_to_template(self):
|
||||
self.assertEqual(MetadataParserPP.field_to_template('title'), '%(title)s')
|
||||
self.assertEqual(MetadataParserPP.field_to_template('1'), '1')
|
||||
self.assertEqual(MetadataParserPP.field_to_template('foo bar'), 'foo bar')
|
||||
self.assertEqual(MetadataParserPP.field_to_template(' literal'), ' literal')
|
||||
|
||||
def test_in_out_seperation(self):
|
||||
pp = MetadataFromFieldPP(None, ['%(title)s \\: %(artist)s:%(title)s : %(artist)s'])
|
||||
self.assertEqual(pp._data[0]['in'], '%(title)s : %(artist)s')
|
||||
self.assertEqual(pp._data[0]['out'], '%(title)s : %(artist)s')
|
||||
|
||||
|
||||
class TestMetadataFromTitle(unittest.TestCase):
|
||||
def test_format_to_regex(self):
|
||||
pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
|
||||
self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
|
||||
def test_metadatafromfield(self):
|
||||
self.assertEqual(
|
||||
MetadataFromFieldPP.to_action('%(title)s \\: %(artist)s:%(title)s : %(artist)s'),
|
||||
(MetadataParserPP.Actions.INTERPRET, '%(title)s : %(artist)s', '%(title)s : %(artist)s'))
|
||||
|
||||
|
||||
class TestConvertThumbnail(unittest.TestCase):
|
||||
|
|
|
@ -1281,7 +1281,7 @@ def process_ie_result(self, ie_result, download=True, extra_info={}):
|
|||
ie_result = self.process_video_result(ie_result, download=download)
|
||||
additional_urls = (ie_result or {}).get('additional_urls')
|
||||
if additional_urls:
|
||||
# TODO: Improve MetadataFromFieldPP to allow setting a list
|
||||
# TODO: Improve MetadataParserPP to allow setting a list
|
||||
if isinstance(additional_urls, compat_str):
|
||||
additional_urls = [additional_urls]
|
||||
self.to_screen(
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
import codecs
|
||||
import io
|
||||
import itertools
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
|
@ -18,6 +19,7 @@
|
|||
)
|
||||
from .compat import (
|
||||
compat_getpass,
|
||||
compat_shlex_quote,
|
||||
workaround_optparse_bug9161,
|
||||
)
|
||||
from .cookies import SUPPORTED_BROWSERS
|
||||
|
@ -46,14 +48,15 @@
|
|||
from .extractor import gen_extractors, list_extractors
|
||||
from .extractor.common import InfoExtractor
|
||||
from .extractor.adobepass import MSO_INFO
|
||||
from .postprocessor.ffmpeg import (
|
||||
from .postprocessor import (
|
||||
FFmpegExtractAudioPP,
|
||||
FFmpegSubtitlesConvertorPP,
|
||||
FFmpegThumbnailsConvertorPP,
|
||||
FFmpegVideoConvertorPP,
|
||||
FFmpegVideoRemuxerPP,
|
||||
MetadataFromFieldPP,
|
||||
MetadataParserPP,
|
||||
)
|
||||
from .postprocessor.metadatafromfield import MetadataFromFieldPP
|
||||
from .YoutubeDL import YoutubeDL
|
||||
|
||||
|
||||
|
@ -344,13 +347,29 @@ def validate_outtmpl(tmpl, msg):
|
|||
if re.match(InfoExtractor.FormatSort.regex, f) is None:
|
||||
parser.error('invalid format sort string "%s" specified' % f)
|
||||
|
||||
if opts.metafromfield is None:
|
||||
opts.metafromfield = []
|
||||
def metadataparser_actions(f):
    """Yield validated MetadataParserPP action tuples for one option value.

    A `str` value is a `--parse-metadata FROM:TO` argument and produces a
    single INTERPRET action. Any other value is treated as the
    (FIELDS, REGEX, REPLACE) sequence of `--replace-in-metadata` and
    produces one REPLACE action per comma-separated field.
    Every failure is reported through `parser.error`.
    """
    if not isinstance(f, str):
        cmd = '--replace-in-metadata %s' % ' '.join(map(compat_shlex_quote, f))
        # One REPLACE action for each field named in the first argument
        actions = ((MetadataParserPP.Actions.REPLACE, x, *f[1:]) for x in f[0].split(','))
    else:
        cmd = '--parse-metadata %s' % compat_shlex_quote(f)
        try:
            actions = [MetadataFromFieldPP.to_action(f)]
        except Exception as err:
            parser.error(f'{cmd} is invalid; {err}')

    for candidate in actions:
        try:
            MetadataParserPP.validate_action(*candidate)
        except Exception as err:
            parser.error(f'{cmd} is invalid; {err}')
        yield candidate
|
||||
|
||||
if opts.parse_metadata is None:
|
||||
opts.parse_metadata = []
|
||||
if opts.metafromtitle is not None:
|
||||
opts.metafromfield.append('title:%s' % opts.metafromtitle)
|
||||
for f in opts.metafromfield:
|
||||
if re.match(MetadataFromFieldPP.regex, f) is None:
|
||||
parser.error('invalid format string "%s" specified for --parse-metadata' % f)
|
||||
opts.parse_metadata.append('title:%s' % opts.metafromtitle)
|
||||
opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, opts.parse_metadata)))
|
||||
|
||||
any_getting = opts.forceprint or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
|
||||
any_printing = opts.print_json
|
||||
|
@ -402,10 +421,10 @@ def report_conflict(arg1, arg2):
|
|||
|
||||
# PostProcessors
|
||||
postprocessors = []
|
||||
if opts.metafromfield:
|
||||
if opts.parse_metadata:
|
||||
postprocessors.append({
|
||||
'key': 'MetadataFromField',
|
||||
'formats': opts.metafromfield,
|
||||
'key': 'MetadataParser',
|
||||
'actions': opts.parse_metadata,
|
||||
# Run this immediately after extraction is complete
|
||||
'when': 'pre_process'
|
||||
})
|
||||
|
|
|
@ -1241,10 +1241,14 @@ def _dict_from_options_callback(
|
|||
help=optparse.SUPPRESS_HELP)
|
||||
postproc.add_option(
|
||||
'--parse-metadata',
|
||||
metavar='FROM:TO', dest='metafromfield', action='append',
|
||||
metavar='FROM:TO', dest='parse_metadata', action='append',
|
||||
help=(
|
||||
'Parse additional metadata like title/artist from other fields; '
|
||||
'see "MODIFYING METADATA" for details'))
|
||||
postproc.add_option(
|
||||
'--replace-in-metadata',
|
||||
dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3,
|
||||
help='Replace text in a metadata field using the given regex. This option can be used multiple times')
|
||||
postproc.add_option(
|
||||
'--xattrs',
|
||||
action='store_true', dest='xattrs', default=False,
|
||||
|
|
|
@ -20,8 +20,11 @@
|
|||
)
|
||||
from .xattrpp import XAttrMetadataPP
|
||||
from .execafterdownload import ExecAfterDownloadPP
|
||||
from .metadatafromfield import MetadataFromFieldPP
|
||||
from .metadatafromfield import MetadataFromTitlePP
|
||||
from .metadataparser import (
|
||||
MetadataFromFieldPP,
|
||||
MetadataFromTitlePP,
|
||||
MetadataParserPP,
|
||||
)
|
||||
from .movefilesafterdownload import MoveFilesAfterDownloadPP
|
||||
from .sponskrub import SponSkrubPP
|
||||
|
||||
|
@ -48,6 +51,7 @@ def get_postprocessor(key):
|
|||
'FFmpegThumbnailsConvertorPP',
|
||||
'FFmpegVideoConvertorPP',
|
||||
'FFmpegVideoRemuxerPP',
|
||||
'MetadataParserPP',
|
||||
'MetadataFromFieldPP',
|
||||
'MetadataFromTitlePP',
|
||||
'MoveFilesAfterDownloadPP',
|
||||
|
|
|
@ -1,74 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from .common import PostProcessor
|
||||
from ..compat import compat_str
|
||||
|
||||
|
||||
class MetadataFromFieldPP(PostProcessor):
    """Set info-dict fields by matching a field or template against a pattern.

    Every entry of `formats` is a 'FROM:TO' string. FROM is a field name or
    a template; TO is either a regex with named groups or a string of
    '%(field)s' placeholders that is converted to such a regex.
    """

    # Splits 'FROM:TO' at the first colon not preceded by a backslash
    regex = r'(?P<in>.*?)(?<!\\):(?P<out>.+)$'

    def __init__(self, downloader, formats):
        PostProcessor.__init__(self, downloader)
        assert isinstance(formats, (list, tuple))
        self._data = []
        for spec in formats:
            assert isinstance(spec, compat_str)
            parsed = re.match(self.regex, spec)
            assert parsed is not None
            # An escaped colon in FROM stands for a literal colon
            source = parsed.group('in').replace('\\:', ':')
            target = parsed.group('out')
            entry = {'in': source, 'out': target}
            entry['tmpl'] = self.field_to_template(source)
            entry['regex'] = self.format_to_regex(target)
            self._data.append(entry)

    @staticmethod
    def field_to_template(tmpl):
        """Wrap a bare field name (letters and underscores only) as '%(name)s'."""
        is_plain_field = re.match(r'[a-zA-Z_]+$', tmpl) is not None
        return '%%(%s)s' % tmpl if is_plain_field else tmpl

    @staticmethod
    def format_to_regex(fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>.+)\ \-\ (?P<artist>.+)'

        A string without any '%(name)s' placeholder is returned unchanged.
        """
        placeholders = list(re.finditer(r'%\((\w+)\)s', fmt))
        if not placeholders:
            return fmt

        pieces = []
        cursor = 0
        for placeholder in placeholders:
            # Literal text between placeholders is escaped, each placeholder becomes a named group
            pieces.append(re.escape(fmt[cursor:placeholder.start()]))
            pieces.append(r'(?P<%s>.+)' % placeholder.group(1))
            cursor = placeholder.end()
        pieces.append(re.escape(fmt[cursor:]))
        return ''.join(pieces)

    def run(self, info):
        """Apply every stored format to `info` and return ([], info)."""
        for entry in self._data:
            template, pattern = entry['tmpl'], entry['regex']
            outtmpl, tmpl_dict = self._downloader.prepare_outtmpl(template, info)
            data_to_parse = self._downloader.escape_outtmpl(outtmpl) % tmpl_dict
            self.write_debug('Searching for r"%s" in %s' % (pattern, template))
            found = re.search(pattern, data_to_parse)
            if found is not None:
                for attribute, value in found.groupdict().items():
                    info[attribute] = value
                    self.to_screen('parsed %s from "%s": %s' % (attribute, template, 'NA' if value is None else value))
            else:
                self.report_warning('Could not interpret video %s as "%s"' % (entry['in'], entry['out']))
        return [], info
|
||||
|
||||
|
||||
class MetadataFromTitlePP(MetadataFromFieldPP):  # for backward compatibility
    """MetadataFromFieldPP restricted to parsing the 'title' field."""

    def __init__(self, downloader, titleformat):
        title_spec = '%%(title)s:%s' % titleformat
        super(MetadataFromTitlePP, self).__init__(downloader, [title_spec])
        # Keep the format and the regex derived from it available as attributes
        self._titleformat = titleformat
        self._titleregex = self._data[0]['regex']
|
117
yt_dlp/postprocessor/metadataparser.py
Normal file
117
yt_dlp/postprocessor/metadataparser.py
Normal file
|
@ -0,0 +1,117 @@
|
|||
import re
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from .common import PostProcessor
|
||||
|
||||
|
||||
class MetadataParserPP(PostProcessor):
    """Post-processor that edits the info dict through a list of actions.

    Each action is a tuple whose first item is a member of `Actions` and
    whose remaining items are the arguments of the matching builder method:
        (Actions.INTERPRET, from, to)
        (Actions.REPLACE, field, search, replace)
    """

    class Actions(Enum):
        # The value of each member is the name of the method that builds
        # the callable for that action
        INTERPRET = 'interpretter'
        REPLACE = 'replacer'

    def __init__(self, downloader, actions):
        """Build one callable per action tuple; they are applied in order by run()."""
        PostProcessor.__init__(self, downloader)
        self._actions = []
        for f in actions:
            action = f[0]
            assert isinstance(action, self.Actions)
            self._actions.append(getattr(self, action.value)(*f[1:]))

    @classmethod
    def validate_action(cls, action, *data):
        ''' Each action can be:
                (Actions.INTERPRET, from, to) OR
                (Actions.REPLACE, field, search, replace)

        Raises ValueError if `action` is not a member of Actions. Any
        exception raised while building the action (e.g. by re.compile)
        propagates to the caller.
        '''
        if not isinstance(action, cls.Actions):
            raise ValueError(f'{action!r} is not a valid action')
        # The builders only touch `self` inside the returned closure, so the
        # class itself can stand in for an instance during validation
        getattr(cls, action.value)(cls, *data)

    @staticmethod
    def field_to_template(tmpl):
        """Wrap a bare field name (letters and underscores only) as '%(name)s'."""
        if re.match(r'[a-zA-Z_]+$', tmpl):
            return f'%({tmpl})s'
        return tmpl

    @staticmethod
    def format_to_regex(fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>.+)\ \-\ (?P<artist>.+)'

        A string without any '%(name)s' placeholder is returned unchanged.
        """
        if not re.search(r'%\(\w+\)s', fmt):
            return fmt
        lastpos = 0
        regex = ''
        # replace %(..)s with regex group and escape other string parts
        for match in re.finditer(r'%\((\w+)\)s', fmt):
            regex += re.escape(fmt[lastpos:match.start()])
            regex += rf'(?P<{match.group(1)}>.+)'
            lastpos = match.end()
        if lastpos < len(fmt):
            regex += re.escape(fmt[lastpos:])
        return regex

    def run(self, info):
        """Apply every action to `info` in order and return ([], info)."""
        for f in self._actions:
            f(info)
        return [], info

    def interpretter(self, inp, out):
        """Return a callable that parses the field/template `inp` with the pattern `out`.

        Every named group of the match is written into the info dict. When
        the pattern does not match, a warning is reported and the info dict
        is left untouched.
        """
        template = self.field_to_template(inp)
        # Compiled up front so that an invalid pattern fails when the action is built
        out_re = re.compile(self.format_to_regex(out))

        def f(info):
            outtmpl, tmpl_dict = self._downloader.prepare_outtmpl(template, info)
            data_to_parse = self._downloader.escape_outtmpl(outtmpl) % tmpl_dict
            self.write_debug(f'Searching for r{out_re.pattern!r} in {template!r}')
            match = out_re.search(data_to_parse)
            if match is None:
                # Must be an f-string; a plain string would print the placeholders literally
                self.report_warning(f'Could not interpret {inp!r} as {out!r}')
                return
            for attribute, value in match.groupdict().items():
                info[attribute] = value
                self.to_screen('Parsed %s from %r: %r' % (attribute, template, value if value is not None else 'NA'))

        return f

    def replacer(self, field, search, replace):
        """Return a callable that substitutes regex `search` with `replace` in `field`.

        Fields that are missing (None) or are not strings are skipped with
        a warning.
        """
        # Compiled up front so that an invalid pattern fails when the action is built
        search_re = re.compile(search)

        def f(info):
            val = info.get(field)
            if val is None:
                self.report_warning(f'Video does not have a {field}')
                return
            elif not isinstance(val, str):
                self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}')
                return
            self.write_debug(f'Replacing all r{search!r} in {field} with {replace!r}')
            info[field], n = search_re.subn(replace, val)
            if n:
                self.to_screen(f'Changed {field} to: {info[field]}')
            else:
                self.to_screen(f'Did not find r{search!r} in {field}')

        return f
|
||||
|
||||
|
||||
class MetadataFromFieldPP(MetadataParserPP):
    """MetadataParserPP configured from 'FROM:TO' strings."""

    @classmethod
    def to_action(cls, f):
        """Convert a 'FROM:TO' string into an (Actions.INTERPRET, from, to) tuple.

        The split happens at the first colon not preceded by a backslash;
        escaped colons in FROM become literal colons.
        Raises ValueError when `f` does not have that shape.
        """
        parsed = re.match(r'(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
        if parsed is not None:
            source = parsed.group('in').replace('\\:', ':')
            return cls.Actions.INTERPRET, source, parsed.group('out')
        raise ValueError(f'it should be FROM:TO, not {f!r}')

    def __init__(self, downloader, formats):
        actions = list(map(self.to_action, formats))
        MetadataParserPP.__init__(self, downloader, actions)
|
||||
|
||||
|
||||
class MetadataFromTitlePP(MetadataParserPP):  # for backward compatibility
    """MetadataParserPP with a single INTERPRET action on the 'title' field."""

    def __init__(self, downloader, titleformat):
        title_action = (self.Actions.INTERPRET, 'title', titleformat)
        MetadataParserPP.__init__(self, downloader, [title_action])
|
Loading…
Reference in a new issue