Add regex to --match-filter

This does not fully deprecate `--match-title`/`--reject-title`,
since `--match-filter` is only checked after the extraction is complete,
whereas `--match-title` can often be checked from the flat playlist
(i.e. an entry can be rejected by title before it is fully extracted).

Fixes: https://github.com/ytdl-org/youtube-dl/issues/9092, https://github.com/ytdl-org/youtube-dl/issues/23035
This commit is contained in:
pukkandan 2021-08-05 03:01:23 +05:30
parent 77b87f0519
commit a047eeb6d2
No known key found for this signature in database
GPG key ID: 0F00D95A001F4698
4 changed files with 82 additions and 43 deletions

View file

@ -340,19 +340,22 @@ ## Video Selection:
COUNT views COUNT views
--match-filter FILTER Generic video filter. Any field (see --match-filter FILTER Generic video filter. Any field (see
"OUTPUT TEMPLATE") can be compared with a "OUTPUT TEMPLATE") can be compared with a
number or a quoted string using the number or a string using the operators
operators defined in "Filtering formats". defined in "Filtering formats". You can
You can also simply specify a field to also simply specify a field to match if the
match if the field is present and "!field" field is present and "!field" to check if
to check if the field is not present. the field is not present. In addition,
Multiple filters can be checked using "&". Python style regular expression matching
For example, to only match videos that are can be done using "~=", and multiple
not live, has a like count more than 100, a filters can be checked with "&". Use a "\"
dislike count less than 50 (or the dislike to escape "&" or quotes if needed. Eg:
--match-filter "!is_live & like_count>?100
& description~=\'(?i)\bcats \& dogs\b\'"
matches only videos that are not live, has
a like count more than 100 (or the like
field is not available), and also has a field is not available), and also has a
description that contains "python", use description that contains the phrase "cats
--match-filter "!is_live & like_count>100 & & dogs" (ignoring case)
dislike_count<?50 & description*='python'"
--no-match-filter Do not use generic video filter (default) --no-match-filter Do not use generic video filter (default)
--no-playlist Download only the video, if the URL refers --no-playlist Download only the video, if the URL refers
to a video and a playlist to a video and a playlist

View file

@ -1207,11 +1207,26 @@ def test_render_table(self):
'9999 51') '9999 51')
def test_match_str(self): def test_match_str(self):
# Unary
self.assertFalse(match_str('xy', {'x': 1200})) self.assertFalse(match_str('xy', {'x': 1200}))
self.assertTrue(match_str('!xy', {'x': 1200})) self.assertTrue(match_str('!xy', {'x': 1200}))
self.assertTrue(match_str('x', {'x': 1200})) self.assertTrue(match_str('x', {'x': 1200}))
self.assertFalse(match_str('!x', {'x': 1200})) self.assertFalse(match_str('!x', {'x': 1200}))
self.assertTrue(match_str('x', {'x': 0})) self.assertTrue(match_str('x', {'x': 0}))
self.assertTrue(match_str('is_live', {'is_live': True}))
self.assertFalse(match_str('is_live', {'is_live': False}))
self.assertFalse(match_str('is_live', {'is_live': None}))
self.assertFalse(match_str('is_live', {}))
self.assertFalse(match_str('!is_live', {'is_live': True}))
self.assertTrue(match_str('!is_live', {'is_live': False}))
self.assertTrue(match_str('!is_live', {'is_live': None}))
self.assertTrue(match_str('!is_live', {}))
self.assertTrue(match_str('title', {'title': 'abc'}))
self.assertTrue(match_str('title', {'title': ''}))
self.assertFalse(match_str('!title', {'title': 'abc'}))
self.assertFalse(match_str('!title', {'title': ''}))
# Numeric
self.assertFalse(match_str('x>0', {'x': 0})) self.assertFalse(match_str('x>0', {'x': 0}))
self.assertFalse(match_str('x>0', {})) self.assertFalse(match_str('x>0', {}))
self.assertTrue(match_str('x>?0', {})) self.assertTrue(match_str('x>?0', {}))
@ -1219,6 +1234,8 @@ def test_match_str(self):
self.assertFalse(match_str('x>2K', {'x': 1200})) self.assertFalse(match_str('x>2K', {'x': 1200}))
self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200})) self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200}))
self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200})) self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200}))
# String
self.assertFalse(match_str('y=a212', {'y': 'foobar42'})) self.assertFalse(match_str('y=a212', {'y': 'foobar42'}))
self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'})) self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'}))
self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'})) self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'}))
@ -1234,6 +1251,8 @@ def test_match_str(self):
self.assertTrue(match_str('y!*=baz', {'y': 'foobar42'})) self.assertTrue(match_str('y!*=baz', {'y': 'foobar42'}))
self.assertTrue(match_str('y$=42', {'y': 'foobar42'})) self.assertTrue(match_str('y$=42', {'y': 'foobar42'}))
self.assertFalse(match_str('y$=43', {'y': 'foobar42'})) self.assertFalse(match_str('y$=43', {'y': 'foobar42'}))
# And
self.assertFalse(match_str( self.assertFalse(match_str(
'like_count > 100 & dislike_count <? 50 & description', 'like_count > 100 & dislike_count <? 50 & description',
{'like_count': 90, 'description': 'foo'})) {'like_count': 90, 'description': 'foo'}))
@ -1246,18 +1265,29 @@ def test_match_str(self):
self.assertFalse(match_str( self.assertFalse(match_str(
'like_count > 100 & dislike_count <? 50 & description', 'like_count > 100 & dislike_count <? 50 & description',
{'like_count': 190, 'dislike_count': 10})) {'like_count': 190, 'dislike_count': 10}))
self.assertTrue(match_str('is_live', {'is_live': True}))
self.assertFalse(match_str('is_live', {'is_live': False})) # Regex
self.assertFalse(match_str('is_live', {'is_live': None})) self.assertTrue(match_str(r'x~=\bbar', {'x': 'foo bar'}))
self.assertFalse(match_str('is_live', {})) self.assertFalse(match_str(r'x~=\bbar.+', {'x': 'foo bar'}))
self.assertFalse(match_str('!is_live', {'is_live': True})) self.assertFalse(match_str(r'x~=^FOO', {'x': 'foo bar'}))
self.assertTrue(match_str('!is_live', {'is_live': False})) self.assertTrue(match_str(r'x~=(?i)^FOO', {'x': 'foo bar'}))
self.assertTrue(match_str('!is_live', {'is_live': None}))
self.assertTrue(match_str('!is_live', {})) # Quotes
self.assertTrue(match_str('title', {'title': 'abc'})) self.assertTrue(match_str(r'x^="foo"', {'x': 'foo "bar"'}))
self.assertTrue(match_str('title', {'title': ''})) self.assertFalse(match_str(r'x^="foo "', {'x': 'foo "bar"'}))
self.assertFalse(match_str('!title', {'title': 'abc'})) self.assertFalse(match_str(r'x$="bar"', {'x': 'foo "bar"'}))
self.assertFalse(match_str('!title', {'title': ''})) self.assertTrue(match_str(r'x$=" \"bar\""', {'x': 'foo "bar"'}))
# Escaping &
self.assertFalse(match_str(r'x=foo & bar', {'x': 'foo & bar'}))
self.assertTrue(match_str(r'x=foo \& bar', {'x': 'foo & bar'}))
self.assertTrue(match_str(r'x=foo \& bar & x^=foo', {'x': 'foo & bar'}))
self.assertTrue(match_str(r'x="foo \& bar" & x^=foo', {'x': 'foo & bar'}))
# Example from docs
self.assertTrue(
r'!is_live & like_count>?100 & description~=\'(?i)\bcats \& dogs\b\'',
{'description': 'Raining Cats & Dogs'})
def test_parse_dfxp_time_expr(self): def test_parse_dfxp_time_expr(self):
self.assertEqual(parse_dfxp_time_expr(None), None) self.assertEqual(parse_dfxp_time_expr(None), None)

View file

@ -378,13 +378,14 @@ def _dict_from_options_callback(
'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a ' 'Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a '
'number or a string using the operators defined in "Filtering formats". ' 'number or a string using the operators defined in "Filtering formats". '
'You can also simply specify a field to match if the field is present ' 'You can also simply specify a field to match if the field is present '
'and "!field" to check if the field is not present. ' 'and "!field" to check if the field is not present. In addition, '
'Multiple filters can be checked using "&". ' 'Python style regular expression matching can be done using "~=", '
'For example, to only match videos that are not live, ' 'and multiple filters can be checked with "&". '
'has a like count more than 100, a dislike count less than 50 ' 'Use a "\\" to escape "&" or quotes if needed. Eg: --match-filter '
'(or the dislike field is not available), and also has a description ' r'"!is_live & like_count>?100 & description~=\'(?i)\bcats \& dogs\b\'" '
'that contains "python", use --match-filter "!is_live & ' 'matches only videos that are not live, has a like count more than 100 '
'like_count>100 & dislike_count<?50 & description*=\'python\'"')) '(or the like field is not available), and also has a description '
'that contains the phrase "cats & dogs" (ignoring case)'))
selection.add_option( selection.add_option(
'--no-match-filter', '--no-match-filter',
metavar='FILTER', dest='match_filter', action='store_const', const=None, metavar='FILTER', dest='match_filter', action='store_const', const=None,

View file

@ -4664,23 +4664,28 @@ def filter_using_list(row, filterArray):
def _match_one(filter_part, dct): def _match_one(filter_part, dct):
# TODO: Generalize code with YoutubeDL._build_format_filter # TODO: Generalize code with YoutubeDL._build_format_filter
COMPARISON_OPERATORS = { STRING_OPERATORS = {
'<': operator.lt,
'<=': operator.le,
'>': operator.gt,
'>=': operator.ge,
'=': operator.eq,
'*=': operator.contains, '*=': operator.contains,
'^=': lambda attr, value: attr.startswith(value), '^=': lambda attr, value: attr.startswith(value),
'$=': lambda attr, value: attr.endswith(value), '$=': lambda attr, value: attr.endswith(value),
'~=': lambda attr, value: re.search(value, attr),
} }
COMPARISON_OPERATORS = {
**STRING_OPERATORS,
'<=': operator.le, # "<=" must be defined above "<"
'<': operator.lt,
'>=': operator.ge,
'>': operator.gt,
'=': operator.eq,
}
operator_rex = re.compile(r'''(?x)\s* operator_rex = re.compile(r'''(?x)\s*
(?P<key>[a-z_]+) (?P<key>[a-z_]+)
\s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
(?: (?:
(?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
(?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)| (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
(?P<strval>(?![0-9.])[a-z0-9A-Z]*) (?P<strval>.+?)
) )
\s*$ \s*$
''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
@ -4705,9 +4710,8 @@ def _match_one(filter_part, dct):
if quote is not None: if quote is not None:
comparison_value = comparison_value.replace(r'\%s' % quote, quote) comparison_value = comparison_value.replace(r'\%s' % quote, quote)
else: else:
if m.group('op') in ('*=', '^=', '$='): if m.group('op') in STRING_OPERATORS:
raise ValueError( raise ValueError('Operator %s only supports string values!' % m.group('op'))
'Operator %s only supports string values!' % m.group('op'))
try: try:
comparison_value = int(m.group('intval')) comparison_value = int(m.group('intval'))
except ValueError: except ValueError:
@ -4743,7 +4747,8 @@ def match_str(filter_str, dct):
""" Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """ """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
return all( return all(
_match_one(filter_part, dct) for filter_part in filter_str.split('&')) _match_one(filter_part.replace(r'\&', '&'), dct)
for filter_part in re.split(r'(?<!\\)&', filter_str))
def match_filter_func(filter_str): def match_filter_func(filter_str):