[extractor/html5] Separate into own extractor (#4307)

Closes #4291

Authored by: coletdjnz, pukkandan
This commit is contained in:
pukkandan 2022-07-08 03:25:04 +05:30
parent 5fff2e576f
commit f14a2d8382
4 changed files with 30 additions and 21 deletions

View file

@ -85,7 +85,7 @@ def test_nocheckcertificate(self):
ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True}) ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True})
r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port) r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
class TestClientCert(unittest.TestCase): class TestClientCert(unittest.TestCase):
@ -113,7 +113,7 @@ def _run_test(self, **params):
**params, **params,
}) })
r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port) r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
def test_certificate_combined_nopass(self): def test_certificate_combined_nopass(self):
self._run_test(client_certificate=os.path.join(self.certdir, 'clientwithkey.crt')) self._run_test(client_certificate=os.path.join(self.certdir, 'clientwithkey.crt'))

View file

@ -662,6 +662,7 @@
HSEShowIE, HSEShowIE,
HSEProductIE, HSEProductIE,
) )
from .genericembeds import HTML5MediaEmbedIE
from .huajiao import HuajiaoIE from .huajiao import HuajiaoIE
from .huya import HuyaLiveIE from .huya import HuyaLiveIE
from .huffpost import HuffPostIE from .huffpost import HuffPostIE

View file

@ -3776,25 +3776,6 @@ def _real_extract(self, url):
elif embeds: elif embeds:
return self.playlist_result(embeds, **info_dict) return self.playlist_result(embeds, **info_dict)
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
self.report_detected('HTML5 media')
if len(entries) == 1:
entries[0].update({
'id': video_id,
'title': video_title,
})
else:
for num, entry in enumerate(entries, start=1):
entry.update({
'id': f'{video_id}-{num}',
'title': '%s (%d)' % (video_title, num),
})
for entry in entries:
self._sort_formats(entry['formats'])
return self.playlist_result(entries, video_id, video_title)
jwplayer_data = self._find_jwplayer_data( jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json) webpage, video_id, transform_source=js_to_json)
if jwplayer_data: if jwplayer_data:

View file

@ -0,0 +1,27 @@
from .common import InfoExtractor
class HTML5MediaEmbedIE(InfoExtractor):
    """Extractor that picks up HTML5 media found in an already-downloaded webpage."""

    # NOTE(review): presumably a false `_VALID_URL` means this extractor is never
    # matched against a URL directly and only runs via `_extract_from_webpage`
    # -- confirm against the InfoExtractor base class.
    _VALID_URL = False
    IE_NAME = 'html5'
    _WEBPAGE_TESTS = [{
        'url': 'https://html.com/media/',
        'info_dict': {
            'title': 'HTML5 Media',
            'description': 'md5:933b2d02ceffe7a7a0f3c8326d91cc2a',
        },
        'playlist_count': 2,
    }]

    def _extract_from_webpage(self, url, webpage):
        """Yield one entry per HTML5 media item parsed from ``webpage``.

        Entries are numbered starting at 1. Each one gets the id
        ``<generic id>-<n>`` and the title ``<generic title> (<n>)``, and its
        formats are passed through ``self._sort_formats`` before it is yielded.
        Nothing is yielded when the parser returns no entries.
        """
        video_id = self._generic_id(url)
        title = self._generic_title(url)

        media_entries = self._parse_html5_media_entries(
            url, webpage, video_id, m3u8_id='hls')
        if not media_entries:
            return

        for index, media in enumerate(media_entries, start=1):
            media['id'] = f'{video_id}-{index}'
            media['title'] = f'{title} ({index})'
            self._sort_formats(media['formats'])
            yield media