Merge pull request #563 from FiloSottile/IE_cleanup

General IE docs and return dicts cleanup
This commit is contained in:
Filippo Valsorda 2012-11-27 14:22:40 -08:00
commit e643e2c6b7
2 changed files with 92 additions and 82 deletions

View file

@ -327,10 +327,13 @@ def prepare_filename(self, info_dict):
"""Generate the output filename.""" """Generate the output filename."""
try: try:
template_dict = dict(info_dict) template_dict = dict(info_dict)
template_dict['epoch'] = int(time.time()) template_dict['epoch'] = int(time.time())
template_dict['autonumber'] = u'%05d' % self._num_downloads template_dict['autonumber'] = u'%05d' % self._num_downloads
template_dict = dict((key, u'NA' if val is None else val) for key, val in template_dict.items())
template_dict = dict((k, sanitize_filename(u(v), self.params.get('restrictfilenames'))) for k,v in template_dict.items()) template_dict = dict((k, sanitize_filename(u(v), self.params.get('restrictfilenames'))) for k,v in template_dict.items())
filename = self.params['outtmpl'] % template_dict filename = self.params['outtmpl'] % template_dict
return filename return filename
except (ValueError, KeyError), err: except (ValueError, KeyError), err:
@ -359,6 +362,9 @@ def process_info(self, info_dict):
# Keep for backwards compatibility # Keep for backwards compatibility
info_dict['stitle'] = info_dict['title'] info_dict['stitle'] = info_dict['title']
if not 'format' in info_dict:
info_dict['format'] = info_dict['ext']
reason = self._match_entry(info_dict) reason = self._match_entry(info_dict)
if reason is not None: if reason is not None:
self.to_screen(u'[download] ' + reason) self.to_screen(u'[download] ' + reason)
@ -481,6 +487,11 @@ def download(self, url_list):
if not ie.suitable(url): if not ie.suitable(url):
continue continue
# Warn if the _WORKING attribute is False
if not ie.working():
self.trouble(u'WARNING: the program functionality for this site has been marked as broken, '
u'and will probably not work. If you want to go on, use the -i option.')
# Suitable InfoExtractor found # Suitable InfoExtractor found
suitable_found = True suitable_found = True

View file

@ -29,37 +29,48 @@ class InfoExtractor(object):
"""Information Extractor class. """Information Extractor class.
Information extractors are the classes that, given a URL, extract Information extractors are the classes that, given a URL, extract
information from the video (or videos) the URL refers to. This information about the video (or videos) the URL refers to. This
information includes the real video URL, the video title and simplified information includes the real video URL, the video title, author and
title, author and others. The information is stored in a dictionary others. The information is stored in a dictionary which is then
which is then passed to the FileDownloader. The FileDownloader passed to the FileDownloader. The FileDownloader processes this
processes this information possibly downloading the video to the file information possibly downloading the video to the file system, among
system, among other possible outcomes. The dictionaries must include other possible outcomes.
the following fields:
The dictionaries must include the following fields:
id: Video identifier. id: Video identifier.
url: Final video URL. url: Final video URL.
uploader: Nickname of the video uploader. uploader: Nickname of the video uploader, unescaped.
title: Literal title. upload_date: Video upload date (YYYYMMDD).
title: Video title, unescaped.
ext: Video filename extension. ext: Video filename extension.
format: Video format.
player_url: SWF Player URL (may be None).
The following fields are optional. Their primary purpose is to allow The following fields are optional:
youtube-dl to serve as the backend for a video search function, such
as the one in youtube2mp3. They are only used when their respective
forced printing functions are called:
format: The video format, defaults to ext (used for --get-format)
thumbnail: Full URL to a video thumbnail image. thumbnail: Full URL to a video thumbnail image.
description: One-line video description. description: One-line video description.
player_url: SWF Player URL (used for rtmpdump).
subtitles: The .srt file contents.
urlhandle: [internal] The urlHandle to be used to download the file,
like returned by urllib2.urlopen
The fields should all be Unicode strings.
Subclasses of this one should re-define the _real_initialize() and Subclasses of this one should re-define the _real_initialize() and
_real_extract() methods and define a _VALID_URL regexp. _real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors. Probably, they should also be added to the list of extractors.
_real_extract() must return a *list* of information dictionaries as
described above.
Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
""" """
_ready = False _ready = False
_downloader = None _downloader = None
_WORKING = True
def __init__(self, downloader=None): def __init__(self, downloader=None):
"""Constructor. Receives an optional downloader.""" """Constructor. Receives an optional downloader."""
@ -70,6 +81,10 @@ def suitable(self, url):
"""Receives a URL and returns True if suitable for this IE.""" """Receives a URL and returns True if suitable for this IE."""
return re.match(self._VALID_URL, url) is not None return re.match(self._VALID_URL, url) is not None
def working(self):
"""Getter method for _WORKING."""
return self._WORKING
def initialize(self): def initialize(self):
"""Initializes an instance (authentication, etc).""" """Initializes an instance (authentication, etc)."""
if not self._ready: if not self._ready:
@ -365,7 +380,7 @@ def _real_extract(self, url):
video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0]) video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date # upload date
upload_date = u'NA' upload_date = None
mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL) mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
if mobj is not None: if mobj is not None:
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
@ -475,6 +490,9 @@ def _real_extract(self, url):
# Extension # Extension
video_extension = self._video_extensions.get(format_param, 'flv') video_extension = self._video_extensions.get(format_param, 'flv')
video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
self._video_dimensions.get(format_param, '???'))
results.append({ results.append({
'id': video_id.decode('utf-8'), 'id': video_id.decode('utf-8'),
'url': video_real_url.decode('utf-8'), 'url': video_real_url.decode('utf-8'),
@ -482,7 +500,7 @@ def _real_extract(self, url):
'upload_date': upload_date, 'upload_date': upload_date,
'title': video_title, 'title': video_title,
'ext': video_extension.decode('utf-8'), 'ext': video_extension.decode('utf-8'),
'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'format': video_format,
'thumbnail': video_thumbnail.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description, 'description': video_description,
'player_url': player_url, 'player_url': player_url,
@ -613,11 +631,9 @@ def _real_extract(self, url):
'id': video_id.decode('utf-8'), 'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'), 'url': video_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'), 'uploader': video_uploader.decode('utf-8'),
'upload_date': u'NA', 'upload_date': None,
'title': video_title, 'title': video_title,
'ext': video_extension.decode('utf-8'), 'ext': video_extension.decode('utf-8'),
'format': u'NA',
'player_url': None,
}] }]
@ -691,7 +707,7 @@ def _real_extract(self, url):
return return
video_title = unescapeHTML(mobj.group('title').decode('utf-8')) video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
video_uploader = u'NA' video_uploader = None
mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage) mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
if mobj is None: if mobj is None:
# looking for official user # looking for official user
@ -703,7 +719,7 @@ def _real_extract(self, url):
else: else:
video_uploader = mobj.group(1) video_uploader = mobj.group(1)
video_upload_date = u'NA' video_upload_date = None
mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage) mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
if mobj is not None: if mobj is not None:
video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
@ -715,8 +731,6 @@ def _real_extract(self, url):
'upload_date': video_upload_date, 'upload_date': video_upload_date,
'title': video_title, 'title': video_title,
'ext': video_extension.decode('utf-8'), 'ext': video_extension.decode('utf-8'),
'format': u'NA',
'player_url': None,
}] }]
@ -806,12 +820,10 @@ def _real_extract(self, url):
return [{ return [{
'id': video_id.decode('utf-8'), 'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'), 'url': video_url.decode('utf-8'),
'uploader': u'NA', 'uploader': None,
'upload_date': u'NA', 'upload_date': None,
'title': video_title, 'title': video_title,
'ext': video_extension.decode('utf-8'), 'ext': video_extension.decode('utf-8'),
'format': u'NA',
'player_url': None,
}] }]
@ -874,11 +886,9 @@ def _real_extract(self, url):
'id': video_id.decode('utf-8'), 'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'), 'url': video_url.decode('utf-8'),
'uploader': video_uploader, 'uploader': video_uploader,
'upload_date': u'NA', 'upload_date': None,
'title': video_title, 'title': video_title,
'ext': video_extension.decode('utf-8'), 'ext': video_extension.decode('utf-8'),
'format': u'NA',
'player_url': None,
}] }]
@ -1016,13 +1026,11 @@ def _real_extract(self, url, new_video=True):
'id': video_id.decode('utf-8'), 'id': video_id.decode('utf-8'),
'url': video_url, 'url': video_url,
'uploader': video_uploader, 'uploader': video_uploader,
'upload_date': u'NA', 'upload_date': None,
'title': video_title, 'title': video_title,
'ext': video_extension.decode('utf-8'), 'ext': video_extension.decode('utf-8'),
'thumbnail': video_thumbnail.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description, 'description': video_description,
'thumbnail': video_thumbnail,
'player_url': None,
}] }]
@ -1090,7 +1098,7 @@ def _real_extract(self, url, new_video=True):
else: video_description = '' else: video_description = ''
# Extract upload date # Extract upload date
video_upload_date = u'NA' video_upload_date = None
mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage) mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
if mobj is not None: if mobj is not None:
video_upload_date = mobj.group(1) video_upload_date = mobj.group(1)
@ -1136,7 +1144,6 @@ def _real_extract(self, url, new_video=True):
'ext': video_extension, 'ext': video_extension,
'thumbnail': video_thumbnail, 'thumbnail': video_thumbnail,
'description': video_description, 'description': video_description,
'player_url': None,
}] }]
@ -1416,11 +1423,9 @@ def _real_extract(self, url):
'id': video_id.decode('utf-8'), 'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'), 'url': video_url.decode('utf-8'),
'uploader': video_uploader, 'uploader': video_uploader,
'upload_date': u'NA', 'upload_date': None,
'title': video_title, 'title': video_title,
'ext': video_extension.decode('utf-8'), 'ext': video_extension.decode('utf-8'),
'format': u'NA',
'player_url': None,
}] }]
@ -2021,18 +2026,17 @@ def _real_extract(self, url):
return [{ return [{
'id': file_id.decode('utf-8'), 'id': file_id.decode('utf-8'),
'url': file_url.decode('utf-8'), 'url': file_url.decode('utf-8'),
'uploader': u'NA', 'uploader': None,
'upload_date': u'NA', 'upload_date': None,
'title': file_title, 'title': file_title,
'ext': file_extension.decode('utf-8'), 'ext': file_extension.decode('utf-8'),
'format': u'NA',
'player_url': None,
}] }]
class FacebookIE(InfoExtractor): class FacebookIE(InfoExtractor):
"""Information Extractor for Facebook""" """Information Extractor for Facebook"""
_WORKING = False
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&' _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
_NETRC_MACHINE = 'facebook' _NETRC_MACHINE = 'facebook'
@ -2177,7 +2181,7 @@ def _real_extract(self, url):
video_thumbnail = video_info['thumbnail'] video_thumbnail = video_info['thumbnail']
# upload date # upload date
upload_date = u'NA' upload_date = None
if 'upload_date' in video_info: if 'upload_date' in video_info:
upload_time = video_info['upload_date'] upload_time = video_info['upload_date']
timetuple = email.utils.parsedate_tz(upload_time) timetuple = email.utils.parsedate_tz(upload_time)
@ -2232,7 +2236,6 @@ def _real_extract(self, url):
'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
'thumbnail': video_thumbnail.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description.decode('utf-8'), 'description': video_description.decode('utf-8'),
'player_url': None,
}) })
return results return results
@ -2276,6 +2279,8 @@ def _real_extract(self, url):
info = { info = {
'id': title, 'id': title,
'url': url, 'url': url,
'uploader': None,
'upload_date': None,
'title': title, 'title': title,
'ext': ext, 'ext': ext,
'urlhandle': urlh 'urlhandle': urlh
@ -2376,12 +2381,10 @@ def _real_extract(self,url):
return [{ return [{
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'uploader': u'NA', 'uploader': None,
'upload_date': u'NA', 'upload_date': None,
'title': video_title, 'title': video_title,
'ext': u'flv', 'ext': u'flv',
'format': u'NA',
'player_url': None,
}] }]
class ComedyCentralIE(InfoExtractor): class ComedyCentralIE(InfoExtractor):
@ -2638,7 +2641,6 @@ def _real_extract(self, url):
'upload_date': None, 'upload_date': None,
'title': showName, 'title': showName,
'ext': 'flv', 'ext': 'flv',
'format': 'flv',
'thumbnail': imgUrl, 'thumbnail': imgUrl,
'description': description, 'description': description,
'player_url': playerUrl, 'player_url': playerUrl,
@ -2685,6 +2687,8 @@ def _real_extract(self, url):
info = { info = {
'id': video_id, 'id': video_id,
'internal_id': internal_video_id, 'internal_id': internal_video_id,
'uploader': None,
'upload_date': None,
} }
self.report_extraction(video_id) self.report_extraction(video_id)
@ -2703,7 +2707,6 @@ def _real_extract(self, url):
info['url'] = videoNode.findall('./file')[0].text info['url'] = videoNode.findall('./file')[0].text
info['thumbnail'] = videoNode.findall('./thumbnail')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
info['ext'] = info['url'].rpartition('.')[2] info['ext'] = info['url'].rpartition('.')[2]
info['format'] = info['ext']
except IndexError: except IndexError:
self._downloader.trouble(u'\nERROR: Invalid metadata XML file') self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
return return
@ -2774,10 +2777,8 @@ def _real_extract(self, url):
'upload_date': None, 'upload_date': None,
'title': video_title, 'title': video_title,
'ext': 'flv', 'ext': 'flv',
'format': 'flv',
'thumbnail': video_thumbnail, 'thumbnail': video_thumbnail,
'description': None, 'description': None,
'player_url': None,
} }
return [info] return [info]
@ -2871,8 +2872,6 @@ def _real_extract(self, url):
'upload_date': upload_date, 'upload_date': upload_date,
'title': title, 'title': title,
'ext': u'mp3', 'ext': u'mp3',
'format': u'NA',
'player_url': None,
'description': description.decode('utf-8') 'description': description.decode('utf-8')
}] }]
@ -2939,11 +2938,9 @@ def _real_extract(self, url):
'uploader': None, 'uploader': None,
'upload_date': None, 'upload_date': None,
'title': video_title, 'title': video_title,
'ext': extension, 'ext': extension, # Extension is always(?) mp4, but seems to be flv
'format': extension, # Extension is always(?) mp4, but seems to be flv
'thumbnail': None, 'thumbnail': None,
'description': video_description, 'description': video_description,
'player_url': None,
} }
return [info] return [info]
@ -3052,7 +3049,7 @@ def _real_extract(self, url):
'id': file_id.decode('utf-8'), 'id': file_id.decode('utf-8'),
'url': file_url.decode('utf-8'), 'url': file_url.decode('utf-8'),
'uploader': uploader.decode('utf-8'), 'uploader': uploader.decode('utf-8'),
'upload_date': u'NA', 'upload_date': None,
'title': json_data['name'], 'title': json_data['name'],
'ext': file_url.split('.')[-1].decode('utf-8'), 'ext': file_url.split('.')[-1].decode('utf-8'),
'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
@ -3086,6 +3083,8 @@ def _real_extract(self, url):
video = mobj.group('video') video = mobj.group('video')
info = { info = {
'id': course + '_' + video, 'id': course + '_' + video,
'uploader': None,
'upload_date': None,
} }
self.report_extraction(info['id']) self.report_extraction(info['id'])
@ -3104,13 +3103,14 @@ def _real_extract(self, url):
self._downloader.trouble(u'\nERROR: Invalid metadata XML file') self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
return return
info['ext'] = info['url'].rpartition('.')[2] info['ext'] = info['url'].rpartition('.')[2]
info['format'] = info['ext']
return [info] return [info]
elif mobj.group('course'): # A course page elif mobj.group('course'): # A course page
course = mobj.group('course') course = mobj.group('course')
info = { info = {
'id': course, 'id': course,
'type': 'playlist', 'type': 'playlist',
'uploader': None,
'upload_date': None,
} }
self.report_download_webpage(info['id']) self.report_download_webpage(info['id'])
@ -3147,6 +3147,8 @@ def _real_extract(self, url):
info = { info = {
'id': 'Stanford OpenClassroom', 'id': 'Stanford OpenClassroom',
'type': 'playlist', 'type': 'playlist',
'uploader': None,
'upload_date': None,
} }
self.report_download_webpage(info['id']) self.report_download_webpage(info['id'])
@ -3255,6 +3257,7 @@ def _real_extract(self, url):
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'uploader': performer, 'uploader': performer,
'upload_date': None,
'title': video_title, 'title': video_title,
'ext': ext, 'ext': ext,
'format': format, 'format': format,
@ -3376,9 +3379,9 @@ def _real_extract(self, url):
'id': '%s_part%02d' % (video_id, index), 'id': '%s_part%02d' % (video_id, index),
'url': download_url, 'url': download_url,
'uploader': None, 'uploader': None,
'upload_date': None,
'title': video_title, 'title': video_title,
'ext': ext, 'ext': ext,
'format': u'NA'
} }
files_info.append(info) files_info.append(info)
@ -3436,18 +3439,16 @@ def _real_extract(self, url):
return return
video_thumbnail = result.group(1).decode('utf-8') video_thumbnail = result.group(1).decode('utf-8')
info = {'id': video_id, return [{
'id': video_id,
'url': video_url, 'url': video_url,
'uploader': None, 'uploader': None,
'upload_date': None, 'upload_date': None,
'title': video_title, 'title': video_title,
'ext': 'flv', 'ext': 'flv',
'format': 'flv',
'thumbnail': video_thumbnail, 'thumbnail': video_thumbnail,
'description': None, 'description': None,
'player_url': None} }]
return [info]
class GooglePlusIE(InfoExtractor): class GooglePlusIE(InfoExtractor):
@ -3501,7 +3502,7 @@ def _real_extract(self, url):
return return
# Extract upload date # Extract upload date
upload_date = u'NA' upload_date = None
pattern = 'title="Timestamp">(.*?)</a>' pattern = 'title="Timestamp">(.*?)</a>'
mobj = re.search(pattern, webpage) mobj = re.search(pattern, webpage)
if mobj: if mobj:
@ -3512,7 +3513,7 @@ def _real_extract(self, url):
self.report_date(upload_date) self.report_date(upload_date)
# Extract uploader # Extract uploader
uploader = u'NA' uploader = None
pattern = r'rel\="author".*?>(.*?)</a>' pattern = r'rel\="author".*?>(.*?)</a>'
mobj = re.search(pattern, webpage) mobj = re.search(pattern, webpage)
if mobj: if mobj:
@ -3569,6 +3570,4 @@ def _real_extract(self, url):
'upload_date': upload_date.decode('utf-8'), 'upload_date': upload_date.decode('utf-8'),
'title': video_title.decode('utf-8'), 'title': video_title.decode('utf-8'),
'ext': video_extension.decode('utf-8'), 'ext': video_extension.decode('utf-8'),
'format': u'NA',
'player_url': None,
}] }]