[generic] Allow multiple matches for generic hits (Fixes #2818)

This commit is contained in:
Philipp Hagemeister 2014-04-30 02:23:51 +02:00
parent f1f25be6db
commit b30b8698ea

View file

@ -637,70 +637,77 @@ def _real_extract(self, url):
return self.url_result(smotri_url, 'Smotri') return self.url_result(smotri_url, 'Smotri')
# Start with something easy: JW Player in SWFObject # Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None: if not found:
# Look for gorilla-vid style embedding # Look for gorilla-vid style embedding
mobj = re.search(r'''(?sx) found = re.findall(r'''(?sx)
(?: (?:
jw_plugins| jw_plugins|
JWPlayerOptions| JWPlayerOptions|
jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
) )
.*?file\s*:\s*["\'](.*?)["\']''', webpage) .*?file\s*:\s*["\'](.*?)["\']''', webpage)
if mobj is None: if not found:
# Broaden the search a little bit # Broaden the search a little bit
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None: if not found:
# Broaden the search a little bit: JWPlayer JS loader # Broaden the search a little bit: JWPlayer JS loader
mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
if not found:
if mobj is None:
# Try to find twitter cards info # Try to find twitter cards info
mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
if mobj is None: if not found:
# We look for Open Graph info: # We look for Open Graph info:
# We have to match any number of spaces between elements; some sites try to align them (e.g. statigr.am) # We have to match any number of spaces between elements; some sites try to align them (e.g. statigr.am)
m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player: # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None: if m_video_type is not None:
mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
if mobj is None: if not found:
# HTML5 video # HTML5 video
mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL) found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
if mobj is None: if not found:
mobj = re.search( found = re.findall(
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"', r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
webpage) webpage)
if mobj: if found:
new_url = mobj.group(1) new_url = found.group(1)
self.report_following_redirect(new_url) self.report_following_redirect(new_url)
return { return {
'_type': 'url', '_type': 'url',
'url': new_url, 'url': new_url,
} }
if mobj is None: if not found:
raise ExtractorError('Unsupported URL: %s' % url) raise ExtractorError('Unsupported URL: %s' % url)
# It's possible that one of the regexes entries = []
# matched, but returned an empty group: for video_url in found:
if mobj.group(1) is None: video_url = compat_urlparse.urljoin(url, video_url)
raise ExtractorError('Did not find a valid video URL at %s' % url) video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
video_url = mobj.group(1) # Sometimes, jwplayer extraction will result in a YouTube URL
video_url = compat_urlparse.urljoin(url, video_url) if YoutubeIE.suitable(video_url):
video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) entries.append(self.url_result(video_url, 'Youtube'))
continue
# Sometimes, jwplayer extraction will result in a YouTube URL # here's a fun little line of code for you:
if YoutubeIE.suitable(video_url): video_id = os.path.splitext(video_id)[0]
return self.url_result(video_url, 'Youtube')
# here's a fun little line of code for you: entries.append({
video_id = os.path.splitext(video_id)[0] 'id': video_id,
'url': video_url,
'uploader': video_uploader,
'title': video_title,
})
if len(entries) == 1:
return entries[1]
else:
for num, e in enumerate(entries, start=1):
e['title'] = '%s (%d)' % (e['title'], num)
return {
'_type': 'playlist',
'entries': entries,
}
return {
'id': video_id,
'url': video_url,
'uploader': video_uploader,
'title': video_title,
}