Merge remote-tracking branch 'upstream/master'

This commit is contained in:
Andreas Schmitz 2014-02-07 12:20:58 +01:00
commit f4371f4784
11 changed files with 219 additions and 64 deletions

View file

@ -37,6 +37,8 @@ def test_youtube_playlist_matching(self):
assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668 assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M')) self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self): def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M')) self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M'))

View file

@ -117,6 +117,13 @@ def test_youtube_mix(self):
original_video = entries[0] original_video = entries[0]
self.assertEqual(original_video['id'], 'rjFaenf1T-Y') self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
def test_youtube_toptracks(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
entries = result['entries']
self.assertEqual(len(entries), 100)
def test_youtube_toplist(self): def test_youtube_toplist(self):
dl = FakeYDL() dl = FakeYDL()
ie = YoutubeTopListIE(dl) ie = YoutubeTopListIE(dl)

View file

@ -41,6 +41,7 @@
'Chris Gahan', 'Chris Gahan',
'Saimadhav Heblikar', 'Saimadhav Heblikar',
'Mike Col', 'Mike Col',
'Andreas Schmitz',
) )
__license__ = 'Public Domain' __license__ = 'Public Domain'

View file

@ -143,8 +143,10 @@
from .naver import NaverIE from .naver import NaverIE
from .nba import NBAIE from .nba import NBAIE
from .nbc import NBCNewsIE from .nbc import NBCNewsIE
from .ndr import NDRIE
from .ndtv import NDTVIE from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE from .newgrounds import NewgroundsIE
from .nfb import NFBIE
from .nhl import NHLIE, NHLVideocenterIE from .nhl import NHLIE, NHLVideocenterIE
from .niconico import NiconicoIE from .niconico import NiconicoIE
from .ninegag import NineGagIE from .ninegag import NineGagIE

View file

@ -1,14 +1,18 @@
from __future__ import unicode_literals
import re import re
import base64 import base64
import urllib
import json import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError
)
video_container = ('.mp4', '.mkv', '.flv')
class ChilloutzoneIE(InfoExtractor): class ChilloutzoneIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+).html' _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html'
_TEST = { _TEST = {
'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
'md5': 'a76f3457e813ea0037e5244f509e66d1', 'md5': 'a76f3457e813ea0037e5244f509e66d1',
@ -16,6 +20,7 @@ class ChilloutzoneIE(InfoExtractor):
'id': 'enemene-meck-alle-katzen-weg', 'id': 'enemene-meck-alle-katzen-weg',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Enemene Meck - Alle Katzen weg', 'title': 'Enemene Meck - Alle Katzen weg',
'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?',
}, },
} }
@ -23,71 +28,45 @@ def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id') video_id = mobj.group('id')
webpage_url = 'http://www.chilloutzone.net/video/' + video_id + '.html' webpage = self._download_webpage(url, video_id)
# Log that we are starting to download the page base64_video_info = self._html_search_regex(
self.report_download_webpage(webpage_url) r'var cozVidData = "(.+?)";', webpage, 'video data')
webpage = self._download_webpage(webpage_url, video_id)
# Log that we are starting to parse the page
self.report_extraction(video_id)
# Find base64 decoded file info
base64_video_info = self._html_search_regex(r'var cozVidData = "(.+?)";', webpage, u'video Data')
# decode string and find video file
decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8") decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
video_info_dict = json.loads(decoded_video_info) video_info_dict = json.loads(decoded_video_info)
# get video information from dict # get video information from dict
media_url = video_info_dict['mediaUrl'] video_url = video_info_dict['mediaUrl']
description = video_info_dict['description'] description = clean_html(video_info_dict.get('description'))
title = video_info_dict['title'] title = video_info_dict['title']
native_platform = video_info_dict['nativePlatform'] native_platform = video_info_dict['nativePlatform']
native_video_id = video_info_dict['nativeVideoId'] native_video_id = video_info_dict['nativeVideoId']
source_priority = video_info_dict['sourcePriority'] source_priority = video_info_dict['sourcePriority']
# Start video extraction
video_url = ''
# If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
if native_platform == None: if native_platform is None:
# Look for other video urls youtube_url = self._html_search_regex(
video_url = self._html_search_regex(r'<iframe.* src="(.+?)".*', webpage, u'fallback Video URL') r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
if 'youtube' in video_url: webpage, 'fallback video URL', default=None)
self.to_screen(u'Youtube video detected:') if youtube_url is not None:
return self.url_result(video_url, ie='Youtube') return self.url_result(youtube_url, ie='Youtube')
# For debugging purposes
#print video_info_dict
#print native_platform
#print native_video_id
#print source_priority
#print media_url
# Non Fallback: Decide to use native source (e.g. youtube or vimeo) or # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
# the own CDN # the own CDN
if source_priority == 'native': if source_priority == 'native':
if native_platform == 'youtube': if native_platform == 'youtube':
self.to_screen(u'Youtube video detected:') return self.url_result(video_id, ie='Youtube')
video_url = 'https://www.youtube.com/watch?v=' + native_video_id
return self.url_result(video_url, ie='Youtube')
if native_platform == 'vimeo': if native_platform == 'vimeo':
self.to_screen(u'Vimeo video detected:') return self.url_result(
video_url = 'http://vimeo.com/' + native_video_id 'http://vimeo.com/' + native_video_id, ie='Vimeo')
return self.url_result(video_url, ie='Vimeo')
# No redirect, use coz media url if not video_url:
video_url = media_url raise ExtractorError('No video found')
if video_url.endswith('.mp4') == False:
self.report_warning(u'Url does not contain a video container')
return []
return {
return [{
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'ext': 'mp4', 'ext': 'mp4',
'title': title, 'title': title,
'description': description, 'description': description,
}] }

View file

@ -9,7 +9,7 @@
class ElPaisIE(InfoExtractor): class ElPaisIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])' _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
IE_DESCR = 'El País' IE_DESC = 'El País'
_TEST = { _TEST = {
'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html', 'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',

View file

@ -61,7 +61,7 @@ def _real_extract(self, url):
} }
request = compat_urllib_request.Request( request = compat_urllib_request.Request(
'http://mooshare.biz/8dqtk4bjbp8g', compat_urllib_parse.urlencode(download_form)) 'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded') request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.to_screen('%s: Waiting for timeout' % video_id) self.to_screen('%s: Waiting for timeout' % video_id)

View file

@ -0,0 +1,89 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
class NDRIE(InfoExtractor):
IE_NAME = 'ndr'
IE_DESC = 'NDR.de - Mediathek'
_VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
_TESTS = [
# video
{
'url': 'http://www.ndr.de/fernsehen/sendungen/hallo_niedersachsen/media/hallonds19925.html',
'md5': '20eba151ff165f386643dad9c1da08f7',
'info_dict': {
'id': '19925',
'ext': 'mp4',
'title': 'Hallo Niedersachsen ',
'description': 'Bei Hallo Niedersachsen um 19:30 Uhr erfahren Sie alles, was am Tag in Niedersachsen los war.',
'duration': 1722,
},
},
# audio
{
'url': 'http://www.ndr.de/903/audio191719.html',
'md5': '41ed601768534dd18a9ae34d84798129',
'info_dict': {
'id': '191719',
'ext': 'mp3',
'title': '"Es war schockierend"',
'description': 'md5:ed7ff8364793545021a6355b97e95f10',
'duration': 112,
}
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
title = self._og_search_title(page)
description = self._og_search_description(page)
mobj = re.search(
r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>',
page)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
formats = []
mp3_url = re.search(r'''{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
if mp3_url:
formats.append({
'url': mp3_url.group('audio'),
'format_id': 'mp3',
})
thumbnail = None
video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
if video_url:
thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",',
page, 'thumbnail', fatal=False)
if thumbnail:
thumbnail = 'http://www.ndr.de' + thumbnail
for format_id in ['lo', 'hi', 'hq']:
formats.append({
'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
'format_id': format_id,
})
if not formats:
raise ExtractorError('No media links available for %s' % video_id)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
}

View file

@ -0,0 +1,76 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
compat_urllib_parse,
)
class NFBIE(InfoExtractor):
IE_NAME = 'nfb'
IE_DESC = 'National Film Board of Canada'
_VALID_URL = r'https?://(?:www\.)?(nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'
_TEST = {
'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
'info_dict': {
'id': 'qallunaat_why_white_people_are_funny',
'ext': 'mp4',
'title': 'Qallunaat! Why White People Are Funny ',
'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
'duration': 3128,
'uploader': 'Mark Sandiford',
'uploader_id': 'mark-sandiford',
},
'params': {
# rtmp download
'skip_download': True,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage('https://www.nfb.ca/film/%s' % video_id, video_id, 'Downloading film page')
uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
page, 'director id', fatal=False)
uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
page, 'director name', fatal=False)
request = compat_urllib_request.Request('https://www.nfb.ca/film/%s/player_config' % video_id,
compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
config = self._download_xml(request, video_id, 'Downloading player config XML')
thumbnail = config.find("./player/stream/media[@type='posterImage']/assets/asset[@quality='high']/default/url").text
video = config.find("./player/stream/media[@type='video']")
duration = int(video.get('duration'))
title = video.find('title').text
description = video.find('description').text
# It seems assets always go from lower to better quality, so no need to sort
formats = [{
'url': x.find('default/streamerURI').text + '/',
'play_path': x.find('default/url').text,
'rtmp_live': False,
'ext': 'mp4',
'format_id': x.get('quality'),
} for x in video.findall('assets/asset')]
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
'formats': formats,
}

View file

@ -1422,7 +1422,7 @@ def _map_to_format_list(urlmap):
class YoutubePlaylistIE(YoutubeBaseInfoExtractor): class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com playlists' IE_DESC = u'YouTube.com playlists'
_VALID_URL = r"""(?: _VALID_URL = r"""(?x)(?:
(?:https?://)? (?:https?://)?
(?:\w+\.)? (?:\w+\.)?
youtube\.com/ youtube\.com/
@ -1431,7 +1431,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
\? (?:.*?&)*? (?:p|a|list)= \? (?:.*?&)*? (?:p|a|list)=
| p/ | p/
) )
((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}) (
(?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
# Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
.* .*
| |
((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
@ -1441,11 +1445,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
_VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)' _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist' IE_NAME = u'youtube:playlist'
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
@ -1469,7 +1468,7 @@ def _extract_mix(self, playlist_id):
def _real_extract(self, url): def _real_extract(self, url):
# Extract playlist id # Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE) mobj = re.match(self._VALID_URL, url)
if mobj is None: if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url) raise ExtractorError(u'Invalid URL: %s' % url)
playlist_id = mobj.group(1) or mobj.group(2) playlist_id = mobj.group(1) or mobj.group(2)

View file

@ -1,2 +1,2 @@
__version__ = '2014.02.06.1' __version__ = '2014.02.06.3'