mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-10 01:02:13 +01:00
Add regex to --match-filter
This does not fully deprecate `--match-title`/`--reject-title` since `--match-filter` is only checked after the extraction is complete, while `--match-title` can often be checked from the flat playlist. Fixes: https://github.com/ytdl-org/youtube-dl/issues/9092, https://github.com/ytdl-org/youtube-dl/issues/23035
This commit is contained in:
parent
77b87f0519
commit
a047eeb6d2
4 changed files with 82 additions and 43 deletions
27
README.md
27
README.md
|
@ -340,19 +340,22 @@ ## Video Selection:
|
||||||
COUNT views
|
COUNT views
|
||||||
--match-filter FILTER Generic video filter. Any field (see
|
--match-filter FILTER Generic video filter. Any field (see
|
||||||
"OUTPUT TEMPLATE") can be compared with a
|
"OUTPUT TEMPLATE") can be compared with a
|
||||||
number or a quoted string using the
|
number or a string using the operators
|
||||||
operators defined in "Filtering formats".
|
defined in "Filtering formats". You can
|
||||||
You can also simply specify a field to
|
also simply specify a field to match if the
|
||||||
match if the field is present and "!field"
|
field is present and "!field" to check if
|
||||||
to check if the field is not present.
|
the field is not present. In addition,
|
||||||
Multiple filters can be checked using "&".
|
Python style regular expression matching
|
||||||
For example, to only match videos that are
|
can be done using "~=", and multiple
|
||||||
not live, has a like count more than 100, a
|
filters can be checked with "&". Use a "\"
|
||||||
dislike count less than 50 (or the dislike
|
to escape "&" or quotes if needed. Eg:
|
||||||
|
--match-filter "!is_live & like_count>?100
|
||||||
|
& description~=\'(?i)\bcats \& dogs\b\'"
|
||||||
|
matches only videos that are not live, have
|
||||||
|
a like count more than 100 (or the like
|
||||||
field is not available), and also has a
|
field is not available), and also has a
|
||||||
description that contains "python", use
|
description that contains the phrase "cats
|
||||||
--match-filter "!is_live & like_count>100 &
|
& dogs" (ignoring case)
|
||||||
dislike_count<?50 & description*='python'"
|
|
||||||
--no-match-filter Do not use generic video filter (default)
|
--no-match-filter Do not use generic video filter (default)
|
||||||
--no-playlist Download only the video, if the URL refers
|
--no-playlist Download only the video, if the URL refers
|
||||||
to a video and a playlist
|
to a video and a playlist
|
||||||
|
|
|
@ -1207,11 +1207,26 @@ def test_render_table(self):
|
||||||
'9999 51')
|
'9999 51')
|
||||||
|
|
||||||
def test_match_str(self):
|
def test_match_str(self):
|
||||||
|
# Unary
|
||||||
self.assertFalse(match_str('xy', {'x': 1200}))
|
self.assertFalse(match_str('xy', {'x': 1200}))
|
||||||
self.assertTrue(match_str('!xy', {'x': 1200}))
|
self.assertTrue(match_str('!xy', {'x': 1200}))
|
||||||
self.assertTrue(match_str('x', {'x': 1200}))
|
self.assertTrue(match_str('x', {'x': 1200}))
|
||||||
self.assertFalse(match_str('!x', {'x': 1200}))
|
self.assertFalse(match_str('!x', {'x': 1200}))
|
||||||
self.assertTrue(match_str('x', {'x': 0}))
|
self.assertTrue(match_str('x', {'x': 0}))
|
||||||
|
self.assertTrue(match_str('is_live', {'is_live': True}))
|
||||||
|
self.assertFalse(match_str('is_live', {'is_live': False}))
|
||||||
|
self.assertFalse(match_str('is_live', {'is_live': None}))
|
||||||
|
self.assertFalse(match_str('is_live', {}))
|
||||||
|
self.assertFalse(match_str('!is_live', {'is_live': True}))
|
||||||
|
self.assertTrue(match_str('!is_live', {'is_live': False}))
|
||||||
|
self.assertTrue(match_str('!is_live', {'is_live': None}))
|
||||||
|
self.assertTrue(match_str('!is_live', {}))
|
||||||
|
self.assertTrue(match_str('title', {'title': 'abc'}))
|
||||||
|
self.assertTrue(match_str('title', {'title': ''}))
|
||||||
|
self.assertFalse(match_str('!title', {'title': 'abc'}))
|
||||||
|
self.assertFalse(match_str('!title', {'title': ''}))
|
||||||
|
|
||||||
|
# Numeric
|
||||||
self.assertFalse(match_str('x>0', {'x': 0}))
|
self.assertFalse(match_str('x>0', {'x': 0}))
|
||||||
self.assertFalse(match_str('x>0', {}))
|
self.assertFalse(match_str('x>0', {}))
|
||||||
self.assertTrue(match_str('x>?0', {}))
|
self.assertTrue(match_str('x>?0', {}))
|
||||||
|
@ -1219,6 +1234,8 @@ def test_match_str(self):
|
||||||
self.assertFalse(match_str('x>2K', {'x': 1200}))
|
self.assertFalse(match_str('x>2K', {'x': 1200}))
|
||||||
self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200}))
|
self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200}))
|
||||||
self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200}))
|
self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200}))
|
||||||
|
|
||||||
|
# String
|
||||||
self.assertFalse(match_str('y=a212', {'y': 'foobar42'}))
|
self.assertFalse(match_str('y=a212', {'y': 'foobar42'}))
|
||||||
self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'}))
|
self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'}))
|
||||||
self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'}))
|
self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'}))
|
||||||
|
@ -1234,6 +1251,8 @@ def test_match_str(self):
|
||||||
self.assertTrue(match_str('y!*=baz', {'y': 'foobar42'}))
|
self.assertTrue(match_str('y!*=baz', {'y': 'foobar42'}))
|
||||||
self.assertTrue(match_str('y$=42', {'y': 'foobar42'}))
|
self.assertTrue(match_str('y$=42', {'y': 'foobar42'}))
|
||||||
self.assertFalse(match_str('y$=43', {'y': 'foobar42'}))
|
self.assertFalse(match_str('y$=43', {'y': 'foobar42'}))
|
||||||
|
|
||||||
|
# And
|
||||||
self.assertFalse(match_str(
|
self.assertFalse(match_str(
|
||||||
'like_count > 100 & dislike_count <? 50 & description',
|
'like_count > 100 & dislike_count <? 50 & description',
|
||||||
{'like_count': 90, 'description': 'foo'}))
|
{'like_count': 90, 'description': 'foo'}))
|
||||||
|
@ -1246,18 +1265,29 @@ def test_match_str(self):
|
||||||
self.assertFalse(match_str(
|
self.assertFalse(match_str(
|
||||||
'like_count > 100 & dislike_count <? 50 & description',
|
'like_count > 100 & dislike_count <? 50 & description',
|
||||||
{'like_count': 190, 'dislike_count': 10}))
|
{'like_count': 190, 'dislike_count': 10}))
|
||||||
self.assertTrue(match_str('is_live', {'is_live': True}))
|
|
||||||
self.assertFalse(match_str('is_live', {'is_live': False}))
|
# Regex
|
||||||
self.assertFalse(match_str('is_live', {'is_live': None}))
|
self.assertTrue(match_str(r'x~=\bbar', {'x': 'foo bar'}))
|
||||||
self.assertFalse(match_str('is_live', {}))
|
self.assertFalse(match_str(r'x~=\bbar.+', {'x': 'foo bar'}))
|
||||||
self.assertFalse(match_str('!is_live', {'is_live': True}))
|
self.assertFalse(match_str(r'x~=^FOO', {'x': 'foo bar'}))
|
||||||
self.assertTrue(match_str('!is_live', {'is_live': False}))
|
self.assertTrue(match_str(r'x~=(?i)^FOO', {'x': 'foo bar'}))
|
||||||
self.assertTrue(match_str('!is_live', {'is_live': None}))
|
|
||||||
self.assertTrue(match_str('!is_live', {}))
|
# Quotes
|
||||||
self.assertTrue(match_str('title', {'title': 'abc'}))
|
self.assertTrue(match_str(r'x^="foo"', {'x': 'foo "bar"'}))
|
||||||
self.assertTrue(match_str('title', {'title': ''}))
|
self.assertFalse(match_str(r'x^="foo "', {'x': 'foo "bar"'}))
|
||||||
self.assertFalse(match_str('!title', {'title': 'abc'}))
|
self.assertFalse(match_str(r'x$="bar"', {'x': 'foo "bar"'}))
|
||||||
self.assertFalse(match_str('!title', {'title': ''}))
|
self.assertTrue(match_str(r'x$=" \"bar\""', {'x': 'foo "bar"'}))
|
||||||
|
|
||||||
|
# Escaping &
|
||||||
|
self.assertFalse(match_str(r'x=foo & bar', {'x': 'foo & bar'}))
|
||||||
|
self.assertTrue(match_str(r'x=foo \& bar', {'x': 'foo & bar'}))
|
||||||
|
self.assertTrue(match_str(r'x=foo \& bar & x^=foo', {'x': 'foo & bar'}))
|
||||||
|
self.assertTrue(match_str(r'x="foo \& bar" & x^=foo', {'x': 'foo & bar'}))
|
||||||
|
|
||||||
|
# Example from docs
|
||||||
|
self.assertTrue(
|
||||||
|
r'!is_live & like_count>?100 & description~=\'(?i)\bcats \& dogs\b\'',
|
||||||
|
{'description': 'Raining Cats & Dogs'})
|
||||||
|
|
||||||
def test_parse_dfxp_time_expr(self):
|
def test_parse_dfxp_time_expr(self):
|
||||||
self.assertEqual(parse_dfxp_time_expr(None), None)
|
self.assertEqual(parse_dfxp_time_expr(None), None)
|
||||||
|
|
|
@ -378,13 +378,14 @@ def _dict_from_options_callback(
|
||||||
'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a '
|
'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a '
|
||||||
'number or a string using the operators defined in "Filtering formats". '
|
'number or a string using the operators defined in "Filtering formats". '
|
||||||
'You can also simply specify a field to match if the field is present '
|
'You can also simply specify a field to match if the field is present '
|
||||||
'and "!field" to check if the field is not present. '
|
'and "!field" to check if the field is not present. In addition, '
|
||||||
'Multiple filters can be checked using "&". '
|
'Python style regular expression matching can be done using "~=", '
|
||||||
'For example, to only match videos that are not live, '
|
'and multiple filters can be checked with "&". '
|
||||||
'has a like count more than 100, a dislike count less than 50 '
|
'Use a "\\" to escape "&" or quotes if needed. Eg: --match-filter '
|
||||||
'(or the dislike field is not available), and also has a description '
|
r'"!is_live & like_count>?100 & description~=\'(?i)\bcats \& dogs\b\'" '
|
||||||
'that contains "python", use --match-filter "!is_live & '
|
'matches only videos that are not live, has a like count more than 100 '
|
||||||
'like_count>100 & dislike_count<?50 & description*=\'python\'"'))
|
'(or the like field is not available), and also has a description '
|
||||||
|
'that contains the phrase "cats & dogs" (ignoring case)'))
|
||||||
selection.add_option(
|
selection.add_option(
|
||||||
'--no-match-filter',
|
'--no-match-filter',
|
||||||
metavar='FILTER', dest='match_filter', action='store_const', const=None,
|
metavar='FILTER', dest='match_filter', action='store_const', const=None,
|
||||||
|
|
|
@ -4664,23 +4664,28 @@ def filter_using_list(row, filterArray):
|
||||||
|
|
||||||
def _match_one(filter_part, dct):
|
def _match_one(filter_part, dct):
|
||||||
# TODO: Generalize code with YoutubeDL._build_format_filter
|
# TODO: Generalize code with YoutubeDL._build_format_filter
|
||||||
COMPARISON_OPERATORS = {
|
STRING_OPERATORS = {
|
||||||
'<': operator.lt,
|
|
||||||
'<=': operator.le,
|
|
||||||
'>': operator.gt,
|
|
||||||
'>=': operator.ge,
|
|
||||||
'=': operator.eq,
|
|
||||||
'*=': operator.contains,
|
'*=': operator.contains,
|
||||||
'^=': lambda attr, value: attr.startswith(value),
|
'^=': lambda attr, value: attr.startswith(value),
|
||||||
'$=': lambda attr, value: attr.endswith(value),
|
'$=': lambda attr, value: attr.endswith(value),
|
||||||
|
'~=': lambda attr, value: re.search(value, attr),
|
||||||
}
|
}
|
||||||
|
COMPARISON_OPERATORS = {
|
||||||
|
**STRING_OPERATORS,
|
||||||
|
'<=': operator.le, # "<=" must be defined above "<"
|
||||||
|
'<': operator.lt,
|
||||||
|
'>=': operator.ge,
|
||||||
|
'>': operator.gt,
|
||||||
|
'=': operator.eq,
|
||||||
|
}
|
||||||
|
|
||||||
operator_rex = re.compile(r'''(?x)\s*
|
operator_rex = re.compile(r'''(?x)\s*
|
||||||
(?P<key>[a-z_]+)
|
(?P<key>[a-z_]+)
|
||||||
\s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
|
\s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
|
||||||
(?:
|
(?:
|
||||||
(?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
|
(?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
|
||||||
(?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
|
(?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
|
||||||
(?P<strval>(?![0-9.])[a-z0-9A-Z]*)
|
(?P<strval>.+?)
|
||||||
)
|
)
|
||||||
\s*$
|
\s*$
|
||||||
''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
|
''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
|
||||||
|
@ -4705,9 +4710,8 @@ def _match_one(filter_part, dct):
|
||||||
if quote is not None:
|
if quote is not None:
|
||||||
comparison_value = comparison_value.replace(r'\%s' % quote, quote)
|
comparison_value = comparison_value.replace(r'\%s' % quote, quote)
|
||||||
else:
|
else:
|
||||||
if m.group('op') in ('*=', '^=', '$='):
|
if m.group('op') in STRING_OPERATORS:
|
||||||
raise ValueError(
|
raise ValueError('Operator %s only supports string values!' % m.group('op'))
|
||||||
'Operator %s only supports string values!' % m.group('op'))
|
|
||||||
try:
|
try:
|
||||||
comparison_value = int(m.group('intval'))
|
comparison_value = int(m.group('intval'))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
@ -4743,7 +4747,8 @@ def match_str(filter_str, dct):
|
||||||
""" Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
|
""" Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
|
||||||
|
|
||||||
return all(
|
return all(
|
||||||
_match_one(filter_part, dct) for filter_part in filter_str.split('&'))
|
_match_one(filter_part.replace(r'\&', '&'), dct)
|
||||||
|
for filter_part in re.split(r'(?<!\\)&', filter_str))
|
||||||
|
|
||||||
|
|
||||||
def match_filter_func(filter_str):
|
def match_filter_func(filter_str):
|
||||||
|
|
Loading…
Reference in a new issue