Merge remote-tracking branch 'upstream/master'

Andreas Schmitz 2014-02-07 12:20:58 +01:00
commit f4371f4784
11 changed files with 219 additions and 64 deletions

View file

@@ -37,6 +37,8 @@ def test_youtube_playlist_matching(self):
assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M'))

View file

@@ -117,6 +117,13 @@ def test_youtube_mix(self):
original_video = entries[0]
self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
def test_youtube_toptracks(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
entries = result['entries']
self.assertEqual(len(entries), 100)
def test_youtube_toplist(self):
dl = FakeYDL()
ie = YoutubeTopListIE(dl)

View file

@@ -41,6 +41,7 @@
'Chris Gahan',
'Saimadhav Heblikar',
'Mike Col',
'Andreas Schmitz',
)
__license__ = 'Public Domain'

View file

@@ -143,8 +143,10 @@
from .naver import NaverIE
from .nba import NBAIE
from .nbc import NBCNewsIE
from .ndr import NDRIE
from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE
from .nfb import NFBIE
from .nhl import NHLIE, NHLVideocenterIE
from .niconico import NiconicoIE
from .ninegag import NineGagIE

View file

@@ -1,14 +1,18 @@
from __future__ import unicode_literals
import re
import base64
import urllib
import json
from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError
)
video_container = ('.mp4', '.mkv', '.flv')
class ChilloutzoneIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+).html'
_VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html'
_TEST = {
'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
'md5': 'a76f3457e813ea0037e5244f509e66d1',
@@ -16,6 +20,7 @@ class ChilloutzoneIE(InfoExtractor):
'id': 'enemene-meck-alle-katzen-weg',
'ext': 'mp4',
'title': 'Enemene Meck - Alle Katzen weg',
'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?',
},
}
@@ -23,71 +28,45 @@ def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage_url = 'http://www.chilloutzone.net/video/' + video_id + '.html'
webpage = self._download_webpage(url, video_id)
# Log that we are starting to download the page
self.report_download_webpage(webpage_url)
webpage = self._download_webpage(webpage_url, video_id)
# Log that we are starting to parse the page
self.report_extraction(video_id)
# Find base64 decoded file info
base64_video_info = self._html_search_regex(r'var cozVidData = "(.+?)";', webpage, u'video Data')
# decode string and find video file
base64_video_info = self._html_search_regex(
r'var cozVidData = "(.+?)";', webpage, 'video data')
decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
video_info_dict = json.loads(decoded_video_info)
# get video information from dict
media_url = video_info_dict['mediaUrl']
description = video_info_dict['description']
video_url = video_info_dict['mediaUrl']
description = clean_html(video_info_dict.get('description'))
title = video_info_dict['title']
native_platform = video_info_dict['nativePlatform']
native_video_id = video_info_dict['nativeVideoId']
source_priority = video_info_dict['sourcePriority']
# Start video extraction
video_url = ''
# If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
if native_platform == None:
# Look for other video urls
video_url = self._html_search_regex(r'<iframe.* src="(.+?)".*', webpage, u'fallback Video URL')
if 'youtube' in video_url:
self.to_screen(u'Youtube video detected:')
return self.url_result(video_url, ie='Youtube')
# For debugging purposes
#print video_info_dict
#print native_platform
#print native_video_id
#print source_priority
#print media_url
if native_platform is None:
youtube_url = self._html_search_regex(
r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
webpage, 'fallback video URL', default=None)
if youtube_url is not None:
return self.url_result(youtube_url, ie='Youtube')
# Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
# the own CDN
if source_priority == 'native':
if native_platform == 'youtube':
self.to_screen(u'Youtube video detected:')
video_url = 'https://www.youtube.com/watch?v=' + native_video_id
return self.url_result(video_url, ie='Youtube')
return self.url_result(native_video_id, ie='Youtube')
if native_platform == 'vimeo':
self.to_screen(u'Vimeo video detected:')
video_url = 'http://vimeo.com/' + native_video_id
return self.url_result(video_url, ie='Vimeo')
return self.url_result(
'http://vimeo.com/' + native_video_id, ie='Vimeo')
# No redirect, use coz media url
video_url = media_url
if video_url.endswith('.mp4') == False:
self.report_warning(u'Url does not contain a video container')
return []
if not video_url:
raise ExtractorError('No video found')
return [{
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
return {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
'description': description,
}]
}
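
For reference, a minimal standalone sketch of the cozVidData decode path the reworked extractor relies on (base64 -> UTF-8 -> JSON); the payload below is invented for illustration and only the decoding round trip mirrors the code above.

import base64
import json

# Hypothetical value of the page's cozVidData variable (illustrative only)
fake_payload = {
    'mediaUrl': 'http://example.com/clip.mp4',
    'title': 'Example clip',
    'description': '<p>Example description</p>',
    'nativePlatform': None,
    'nativeVideoId': None,
    'sourcePriority': 'default',
}
encoded = base64.b64encode(json.dumps(fake_payload).encode('utf-8')).decode('ascii')

# Same decode path as the extractor: base64 -> UTF-8 string -> JSON dict
video_info_dict = json.loads(base64.b64decode(encoded).decode('utf-8'))
assert video_info_dict['mediaUrl'].endswith('.mp4')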

View file

@@ -9,7 +9,7 @@
class ElPaisIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
IE_DESCR = 'El País'
IE_DESC = 'El País'
_TEST = {
'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',

View file

@@ -61,7 +61,7 @@ def _real_extract(self, url):
}
request = compat_urllib_request.Request(
'http://mooshare.biz/8dqtk4bjbp8g', compat_urllib_parse.urlencode(download_form))
'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.to_screen('%s: Waiting for timeout' % video_id)
@@ -111,4 +111,4 @@ def _real_extract(self, url):
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
}
}

View file

@@ -0,0 +1,89 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class NDRIE(InfoExtractor):
IE_NAME = 'ndr'
IE_DESC = 'NDR.de - Mediathek'
_VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
_TESTS = [
# video
{
'url': 'http://www.ndr.de/fernsehen/sendungen/hallo_niedersachsen/media/hallonds19925.html',
'md5': '20eba151ff165f386643dad9c1da08f7',
'info_dict': {
'id': '19925',
'ext': 'mp4',
'title': 'Hallo Niedersachsen ',
'description': 'Bei Hallo Niedersachsen um 19:30 Uhr erfahren Sie alles, was am Tag in Niedersachsen los war.',
'duration': 1722,
},
},
# audio
{
'url': 'http://www.ndr.de/903/audio191719.html',
'md5': '41ed601768534dd18a9ae34d84798129',
'info_dict': {
'id': '191719',
'ext': 'mp3',
'title': '"Es war schockierend"',
'description': 'md5:ed7ff8364793545021a6355b97e95f10',
'duration': 112,
}
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
title = self._og_search_title(page)
description = self._og_search_description(page)
mobj = re.search(
r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>',
page)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
formats = []
mp3_url = re.search(r'''{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
if mp3_url:
formats.append({
'url': mp3_url.group('audio'),
'format_id': 'mp3',
})
thumbnail = None
video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
if video_url:
thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",',
page, 'thumbnail', fatal=False)
if thumbnail:
thumbnail = 'http://www.ndr.de' + thumbnail
for format_id in ['lo', 'hi', 'hq']:
formats.append({
'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
'format_id': format_id,
})
if not formats:
raise ExtractorError('No media links available for %s' % video_id)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
}
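
For reference, a small sketch of the two derivations in the new NDR extractor: the duration from the min/sec spans and the three progressive format URLs expanded from one matched base URL. The sample strings are made up; only the arithmetic and the ['lo', 'hi', 'hq'] expansion mirror the code above.

import re

sample_html = '<div class="duration"><span class="min">28</span>:<span class="sec">42</span></div>'
mobj = re.search(r'<span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span>', sample_html)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds'))  # 28 * 60 + 42 = 1722

# Hypothetical base URL, as it would be captured in front of '.hi.mp4'
base_url = 'http://media.ndr.de/progressive/2014/0207/TV-20140207-1955-0401'
formats = [{'url': '%s.%s.mp4' % (base_url, format_id), 'format_id': format_id}
           for format_id in ['lo', 'hi', 'hq']]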

View file

@@ -0,0 +1,76 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
compat_urllib_parse,
)
class NFBIE(InfoExtractor):
IE_NAME = 'nfb'
IE_DESC = 'National Film Board of Canada'
_VALID_URL = r'https?://(?:www\.)?(nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'
_TEST = {
'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
'info_dict': {
'id': 'qallunaat_why_white_people_are_funny',
'ext': 'mp4',
'title': 'Qallunaat! Why White People Are Funny ',
'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
'duration': 3128,
'uploader': 'Mark Sandiford',
'uploader_id': 'mark-sandiford',
},
'params': {
# rtmp download
'skip_download': True,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage('https://www.nfb.ca/film/%s' % video_id, video_id, 'Downloading film page')
uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
page, 'director id', fatal=False)
uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
page, 'director name', fatal=False)
request = compat_urllib_request.Request('https://www.nfb.ca/film/%s/player_config' % video_id,
compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
config = self._download_xml(request, video_id, 'Downloading player config XML')
thumbnail = config.find("./player/stream/media[@type='posterImage']/assets/asset[@quality='high']/default/url").text
video = config.find("./player/stream/media[@type='video']")
duration = int(video.get('duration'))
title = video.find('title').text
description = video.find('description').text
# It seems assets always go from lower to better quality, so no need to sort
formats = [{
'url': x.find('default/streamerURI').text + '/',
'play_path': x.find('default/url').text,
'rtmp_live': False,
'ext': 'mp4',
'format_id': x.get('quality'),
} for x in video.findall('assets/asset')]
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'formats': formats,
}
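
To show how the player_config XML is turned into RTMP format entries, here is a self-contained sketch over an invented XML fragment shaped after the element paths the extractor queries (the real NFB response is more elaborate).

import xml.etree.ElementTree as ET

# Invented config fragment; only the element paths match what the code above reads
config = ET.fromstring('''
<config>
  <player>
    <stream>
      <media type="video" duration="3128">
        <title>Example film</title>
        <description>Example description</description>
        <assets>
          <asset quality="sd">
            <default>
              <streamerURI>rtmp://example.nfb.ca/vod</streamerURI>
              <url>mp4:sd/example_sd.mp4</url>
            </default>
          </asset>
          <asset quality="hd">
            <default>
              <streamerURI>rtmp://example.nfb.ca/vod</streamerURI>
              <url>mp4:hd/example_hd.mp4</url>
            </default>
          </asset>
        </assets>
      </media>
    </stream>
  </player>
</config>
''')

video = config.find("./player/stream/media[@type='video']")
duration = int(video.get('duration'))
formats = [{
    'url': x.find('default/streamerURI').text + '/',
    'play_path': x.find('default/url').text,
    'rtmp_live': False,
    'ext': 'mp4',
    'format_id': x.get('quality'),
} for x in video.findall('assets/asset')]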

View file

@@ -1422,7 +1422,7 @@ def _map_to_format_list(urlmap):
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com playlists'
_VALID_URL = r"""(?:
_VALID_URL = r"""(?x)(?:
(?:https?://)?
(?:\w+\.)?
youtube\.com/
@@ -1431,7 +1431,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
\? (?:.*?&)*? (?:p|a|list)=
| p/
)
((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
(
(?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
# Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
.*
|
((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
@@ -1441,11 +1445,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
_VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist'
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def _real_initialize(self):
self._login()
@@ -1469,7 +1468,7 @@ def _extract_mix(self, playlist_id):
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
playlist_id = mobj.group(1) or mobj.group(2)
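
The inline (?x) flag is what lets the suitable() override and the explicit re.VERBOSE argument go away; here is a condensed sketch of the effect using a trimmed-down pattern in the same spirit (not the full _VALID_URL above).

import re

# Trimmed illustration of the playlist ID alternation, including the new MC (top tracks) branch
PLAYLIST_ID_RE = r'''(?x)
    (
        (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
        # Top tracks, they can also include dots
        |(?:MC)[\w\.]*
    )
'''

# (?x) is baked into the pattern, so callers no longer pass re.VERBOSE
assert re.match(PLAYLIST_ID_RE, 'PL4023E734DA416012')
assert re.match(PLAYLIST_ID_RE, 'MCUS.20142101')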

View file

@@ -1,2 +1,2 @@
__version__ = '2014.02.06.1'
__version__ = '2014.02.06.3'