Update to ytdl-commit-2dd6c6e

[YouTube] Avoid crash if uploader_id extraction fails
2dd6c6edd8

Except:
    * 295736c9cba714fb5de7d1c3dd31d86e50091cf8 [jsinterp] Improve parsing
    * 384f632e8a9b61e864a26678d85b2b39933b9bae [ITV] Overhaul ITV extractor
    * 33db85c571304bbd6863e3407ad8d08764c9e53b [feat]: Add support to external downloader aria2p
This commit is contained in:
pukkandan 2023-02-17 16:51:34 +05:30
parent a538772969
commit 45b2ee6f4f
No known key found for this signature in database
GPG key ID: 7EEE9E1E817D0A39
19 changed files with 911 additions and 210 deletions

View file

@ -76,7 +76,7 @@
# NEW FEATURES # NEW FEATURES
* Merged with **youtube-dl v2021.12.17+ [commit/195f22f](https://github.com/ytdl-org/youtube-dl/commit/195f22f)** <!--([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))--> and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) * Merged with **youtube-dl v2021.12.17+ [commit/2dd6c6e](https://github.com/ytdl-org/youtube-dl/commit/2dd6c6e)** ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl)
* **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API

View file

@ -69,6 +69,7 @@ def test_opengraph(self):
<meta name="og:test1" content='foo > < bar'/> <meta name="og:test1" content='foo > < bar'/>
<meta name="og:test2" content="foo >//< bar"/> <meta name="og:test2" content="foo >//< bar"/>
<meta property=og-test3 content='Ill-formatted opengraph'/> <meta property=og-test3 content='Ill-formatted opengraph'/>
<meta property=og:test4 content=unquoted-value/>
''' '''
self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_title(html), 'Foo')
self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
@ -81,6 +82,7 @@ def test_opengraph(self):
self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar') self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)
self.assertEqual(ie._og_search_property('test4', html), 'unquoted-value')
def test_html_search_meta(self): def test_html_search_meta(self):
ie = self.ie ie = self.ie

View file

@ -10,6 +10,7 @@
from test.helper import is_download_test, try_rm from test.helper import is_download_test, try_rm
from yt_dlp import YoutubeDL from yt_dlp import YoutubeDL
from yt_dlp.utils import DownloadError
def _download_restricted(url, filename, age): def _download_restricted(url, filename, age):
@ -25,10 +26,14 @@ def _download_restricted(url, filename, age):
ydl.add_default_info_extractors() ydl.add_default_info_extractors()
json_filename = os.path.splitext(filename)[0] + '.info.json' json_filename = os.path.splitext(filename)[0] + '.info.json'
try_rm(json_filename) try_rm(json_filename)
ydl.download([url]) try:
res = os.path.exists(json_filename) ydl.download([url])
try_rm(json_filename) except DownloadError:
return res pass
else:
return os.path.exists(json_filename)
finally:
try_rm(json_filename)
@is_download_test @is_download_test
@ -38,12 +43,12 @@ def _assert_restricted(self, url, filename, age, old_age=None):
self.assertFalse(_download_restricted(url, filename, age)) self.assertFalse(_download_restricted(url, filename, age))
def test_youtube(self): def test_youtube(self):
self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10) self._assert_restricted('HtVdAasjOgU', 'HtVdAasjOgU.mp4', 10)
def test_youporn(self): def test_youporn(self):
self._assert_restricted( self._assert_restricted(
'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'https://www.youporn.com/watch/16715086/sex-ed-in-detention-18-asmr/',
'505835.mp4', 2, old_age=25) '16715086.mp4', 2, old_age=25)
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -1,5 +1,6 @@
""" Do not use! """ """ Do not use! """
import base64
import collections import collections
import ctypes import ctypes
import getpass import getpass
@ -29,6 +30,7 @@
from re import Pattern as compat_Pattern # noqa: F401 from re import Pattern as compat_Pattern # noqa: F401
from re import match as compat_Match # noqa: F401 from re import match as compat_Match # noqa: F401
from . import compat_expanduser, compat_HTMLParseError, compat_realpath
from .compat_utils import passthrough_module from .compat_utils import passthrough_module
from ..dependencies import Cryptodome_AES as compat_pycrypto_AES # noqa: F401 from ..dependencies import Cryptodome_AES as compat_pycrypto_AES # noqa: F401
from ..dependencies import brotli as compat_brotli # noqa: F401 from ..dependencies import brotli as compat_brotli # noqa: F401
@ -47,23 +49,25 @@ def compat_setenv(key, value, env=os.environ):
env[key] = value env[key] = value
compat_base64_b64decode = base64.b64decode
compat_basestring = str compat_basestring = str
compat_casefold = str.casefold compat_casefold = str.casefold
compat_chr = chr compat_chr = chr
compat_collections_abc = collections.abc compat_collections_abc = collections.abc
compat_cookiejar = http.cookiejar compat_cookiejar = compat_http_cookiejar = http.cookiejar
compat_cookiejar_Cookie = http.cookiejar.Cookie compat_cookiejar_Cookie = compat_http_cookiejar_Cookie = http.cookiejar.Cookie
compat_cookies = http.cookies compat_cookies = compat_http_cookies = http.cookies
compat_cookies_SimpleCookie = http.cookies.SimpleCookie compat_cookies_SimpleCookie = compat_http_cookies_SimpleCookie = http.cookies.SimpleCookie
compat_etree_Element = etree.Element compat_etree_Element = compat_xml_etree_ElementTree_Element = etree.Element
compat_etree_register_namespace = etree.register_namespace compat_etree_register_namespace = compat_xml_etree_register_namespace = etree.register_namespace
compat_filter = filter compat_filter = filter
compat_get_terminal_size = shutil.get_terminal_size compat_get_terminal_size = shutil.get_terminal_size
compat_getenv = os.getenv compat_getenv = os.getenv
compat_getpass = getpass.getpass compat_getpass = compat_getpass_getpass = getpass.getpass
compat_html_entities = html.entities compat_html_entities = html.entities
compat_html_entities_html5 = html.entities.html5 compat_html_entities_html5 = html.entities.html5
compat_HTMLParser = html.parser.HTMLParser compat_html_parser_HTMLParseError = compat_HTMLParseError
compat_HTMLParser = compat_html_parser_HTMLParser = html.parser.HTMLParser
compat_http_client = http.client compat_http_client = http.client
compat_http_server = http.server compat_http_server = http.server
compat_input = input compat_input = input
@ -72,6 +76,8 @@ def compat_setenv(key, value, env=os.environ):
compat_kwargs = lambda kwargs: kwargs compat_kwargs = lambda kwargs: kwargs
compat_map = map compat_map = map
compat_numeric_types = (int, float, complex) compat_numeric_types = (int, float, complex)
compat_os_path_expanduser = compat_expanduser
compat_os_path_realpath = compat_realpath
compat_print = print compat_print = print
compat_shlex_split = shlex.split compat_shlex_split = shlex.split
compat_socket_create_connection = socket.create_connection compat_socket_create_connection = socket.create_connection
@ -81,7 +87,9 @@ def compat_setenv(key, value, env=os.environ):
compat_subprocess_get_DEVNULL = lambda: DEVNULL compat_subprocess_get_DEVNULL = lambda: DEVNULL
compat_tokenize_tokenize = tokenize.tokenize compat_tokenize_tokenize = tokenize.tokenize
compat_urllib_error = urllib.error compat_urllib_error = urllib.error
compat_urllib_HTTPError = urllib.error.HTTPError
compat_urllib_parse = urllib.parse compat_urllib_parse = urllib.parse
compat_urllib_parse_parse_qs = urllib.parse.parse_qs
compat_urllib_parse_quote = urllib.parse.quote compat_urllib_parse_quote = urllib.parse.quote
compat_urllib_parse_quote_plus = urllib.parse.quote_plus compat_urllib_parse_quote_plus = urllib.parse.quote_plus
compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus
@ -90,8 +98,10 @@ def compat_setenv(key, value, env=os.environ):
compat_urllib_request = urllib.request compat_urllib_request = urllib.request
compat_urllib_request_DataHandler = urllib.request.DataHandler compat_urllib_request_DataHandler = urllib.request.DataHandler
compat_urllib_response = urllib.response compat_urllib_response = urllib.response
compat_urlretrieve = urllib.request.urlretrieve compat_urlretrieve = compat_urllib_request_urlretrieve = urllib.request.urlretrieve
compat_xml_parse_error = etree.ParseError compat_xml_parse_error = compat_xml_etree_ElementTree_ParseError = etree.ParseError
compat_xpath = lambda xpath: xpath compat_xpath = lambda xpath: xpath
compat_zip = zip compat_zip = zip
workaround_optparse_bug9161 = lambda: None workaround_optparse_bug9161 = lambda: None
legacy = []

View file

@ -239,6 +239,7 @@
BleacherReportIE, BleacherReportIE,
BleacherReportCMSIE, BleacherReportCMSIE,
) )
from .blerp import BlerpIE
from .blogger import BloggerIE from .blogger import BloggerIE
from .bloomberg import BloombergIE from .bloomberg import BloombergIE
from .bokecc import BokeCCIE from .bokecc import BokeCCIE
@ -861,6 +862,7 @@
from .kickstarter import KickStarterIE from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE from .kinopoisk import KinoPoiskIE
from .kommunetv import KommunetvIE
from .kompas import KompasVideoIE from .kompas import KompasVideoIE
from .konserthusetplay import KonserthusetPlayIE from .konserthusetplay import KonserthusetPlayIE
from .koo import KooIE from .koo import KooIE
@ -1460,6 +1462,7 @@
PuhuTVIE, PuhuTVIE,
PuhuTVSerieIE, PuhuTVSerieIE,
) )
from .pr0gramm import Pr0grammStaticIE, Pr0grammIE
from .prankcast import PrankCastIE from .prankcast import PrankCastIE
from .premiershiprugby import PremiershipRugbyIE from .premiershiprugby import PremiershipRugbyIE
from .presstv import PressTVIE from .presstv import PressTVIE
@ -1521,6 +1524,10 @@
RayWenderlichCourseIE, RayWenderlichCourseIE,
) )
from .rbmaradio import RBMARadioIE from .rbmaradio import RBMARadioIE
from .rbgtum import (
RbgTumIE,
RbgTumCourseIE,
)
from .rcs import ( from .rcs import (
RCSIE, RCSIE,
RCSEmbedsIE, RCSEmbedsIE,

View file

@ -11,7 +11,7 @@
class AmericasTestKitchenIE(InfoExtractor): class AmericasTestKitchenIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
'md5': 'b861c3e365ac38ad319cfd509c30577f', 'md5': 'b861c3e365ac38ad319cfd509c30577f',
@ -72,6 +72,12 @@ class AmericasTestKitchenIE(InfoExtractor):
}, { }, {
'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do',
'only_matching': True,
}, {
'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -100,7 +106,7 @@ def _real_extract(self, url):
class AmericasTestKitchenSeasonIE(InfoExtractor): class AmericasTestKitchenSeasonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P<show>/cookscountry)?/episodes/browse/season_(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|(?P<cooks>cooks(?:country|illustrated)))\.com(?:(?:/(?P<show2>cooks(?:country|illustrated)))?(?:/?$|(?<!ated)(?<!ated\.com)/episodes/browse/season_(?P<season>\d+)))'
_TESTS = [{ _TESTS = [{
# ATK Season # ATK Season
'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1',
@ -117,29 +123,73 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
'title': 'Season 12', 'title': 'Season 12',
}, },
'playlist_count': 13, 'playlist_count': 13,
}, {
# America's Test Kitchen Series
'url': 'https://www.americastestkitchen.com/',
'info_dict': {
'id': 'americastestkitchen',
'title': 'America\'s Test Kitchen',
},
'playlist_count': 558,
}, {
# Cooks Country Series
'url': 'https://www.americastestkitchen.com/cookscountry',
'info_dict': {
'id': 'cookscountry',
'title': 'Cook\'s Country',
},
'playlist_count': 199,
}, {
'url': 'https://www.americastestkitchen.com/cookscountry/',
'only_matching': True,
}, {
'url': 'https://www.cookscountry.com/episodes/browse/season_12',
'only_matching': True,
}, {
'url': 'https://www.cookscountry.com',
'only_matching': True,
}, {
'url': 'https://www.americastestkitchen.com/cooksillustrated/',
'only_matching': True,
}, {
'url': 'https://www.cooksillustrated.com',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
show_path, season_number = self._match_valid_url(url).group('show', 'id') season_number, show1, show = self._match_valid_url(url).group('season', 'show', 'show2')
season_number = int(season_number) show_path = ('/' + show) if show else ''
show = show or show1
season_number = int_or_none(season_number)
slug = 'cco' if show_path == '/cookscountry' else 'atk' slug, title = {
'americastestkitchen': ('atk', 'America\'s Test Kitchen'),
'cookscountry': ('cco', 'Cook\'s Country'),
'cooksillustrated': ('cio', 'Cook\'s Illustrated'),
}[show]
season = 'Season %d' % season_number facet_filters = [
'search_document_klass:episode',
'search_show_slug:' + slug,
]
if season_number:
playlist_id = 'season_%d' % season_number
playlist_title = 'Season %d' % season_number
facet_filters.append('search_season_list:' + playlist_title)
else:
playlist_id = show
playlist_title = title
season_search = self._download_json( season_search = self._download_json(
'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug,
season, headers={ playlist_id, headers={
'Origin': 'https://www.americastestkitchen.com', 'Origin': 'https://www.americastestkitchen.com',
'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
'X-Algolia-Application-Id': 'Y1FNZXUI30', 'X-Algolia-Application-Id': 'Y1FNZXUI30',
}, query={ }, query={
'facetFilters': json.dumps([ 'facetFilters': json.dumps(facet_filters),
'search_season_list:' + season, 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug,
'search_document_klass:episode',
'search_show_slug:' + slug,
]),
'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug,
'attributesToHighlight': '', 'attributesToHighlight': '',
'hitsPerPage': 1000, 'hitsPerPage': 1000,
}) })
@ -162,4 +212,4 @@ def entries():
} }
return self.playlist_result( return self.playlist_result(
entries(), 'season_%d' % season_number, season) entries(), playlist_id, playlist_title)

167
yt_dlp/extractor/blerp.py Normal file
View file

@ -0,0 +1,167 @@
import json
from .common import InfoExtractor
from ..utils import strip_or_none, traverse_obj
class BlerpIE(InfoExtractor):
IE_NAME = 'blerp'
_VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P<id>[0-9a-zA-Z]+)'
_TESTS = [{
'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a',
'info_dict': {
'id': '6320fe8745636cb4dd677a5a',
'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016',
'uploader': 'luminousaj',
'uploader_id': '5fb81e51aa66ae000c395478',
'ext': 'mp3',
'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'],
}
}, {
'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f',
'info_dict': {
'id': '5bc94ef4796001000498429f',
'title': 'Yee',
'uploader': '179617322678353920',
'uploader_id': '5ba99cf71386730004552c42',
'ext': 'mp3',
'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee']
}
}]
_GRAPHQL_OPERATIONNAME = "webBitePageGetBite"
_GRAPHQL_QUERY = (
'''query webBitePageGetBite($_id: MongoID!) {
web {
biteById(_id: $_id) {
...bitePageFrag
__typename
}
__typename
}
}
fragment bitePageFrag on Bite {
_id
title
userKeywords
keywords
color
visibility
isPremium
owned
price
extraReview
isAudioExists
image {
filename
original {
url
__typename
}
__typename
}
userReactions {
_id
reactions
createdAt
__typename
}
topReactions
totalSaveCount
saved
blerpLibraryType
license
licenseMetaData
playCount
totalShareCount
totalFavoriteCount
totalAddedToBoardCount
userCategory
userAudioQuality
audioCreationState
transcription
userTranscription
description
createdAt
updatedAt
author
listingType
ownerObject {
_id
username
profileImage {
filename
original {
url
__typename
}
__typename
}
__typename
}
transcription
favorited
visibility
isCurated
sourceUrl
audienceRating
strictAudienceRating
ownerId
reportObject {
reportedContentStatus
__typename
}
giphy {
mp4
gif
__typename
}
audio {
filename
original {
url
__typename
}
mp3 {
url
__typename
}
__typename
}
__typename
}
''')
def _real_extract(self, url):
audio_id = self._match_id(url)
data = {
'operationName': self._GRAPHQL_OPERATIONNAME,
'query': self._GRAPHQL_QUERY,
'variables': {
'_id': audio_id
}
}
headers = {
'Content-Type': 'application/json'
}
json_result = self._download_json('https://api.blerp.com/graphql',
audio_id, data=json.dumps(data).encode('utf-8'), headers=headers)
bite_json = json_result['data']['web']['biteById']
info_dict = {
'id': bite_json['_id'],
'url': bite_json['audio']['mp3']['url'],
'title': bite_json['title'],
'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none),
'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none),
'ext': 'mp3',
'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None)
}
return info_dict

View file

@ -1,9 +1,5 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import float_or_none, int_or_none, make_archive_id, traverse_obj
traverse_obj,
float_or_none,
int_or_none
)
class CallinIE(InfoExtractor): class CallinIE(InfoExtractor):
@ -35,6 +31,54 @@ class CallinIE(InfoExtractor):
'episode_number': 1, 'episode_number': 1,
'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd' 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
} }
}, {
'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
'md5': '14ede27ee2c957b7e4db93140fc0745c',
'info_dict': {
'id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5',
'ext': 'ts',
'title': 'FCC Commissioner Brendan Carr on Elons Starlink',
'description': 'Or, why the government doesnt like SpaceX',
'channel': 'The Pull Request',
'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa',
'duration': 3182.472,
'series_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638',
'uploader_url': 'http://thepullrequest.com',
'upload_date': '20220902',
'episode': 'FCC Commissioner Brendan Carr on Elons Starlink',
'display_id': 'fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
'series': 'The Pull Request',
'channel_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638',
'view_count': int,
'uploader': 'Antonio García Martínez',
'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/1ade9142625344045dc17cf523469ced1d93610762f4c886d06aa190a2f979e8.png',
'episode_id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5',
'timestamp': 1662100688.005,
}
}, {
'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA',
'md5': '16f704ddbf82a27e3930533b12062f07',
'info_dict': {
'id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c',
'ext': 'ts',
'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?',
'description': 'Lets talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.',
'channel': 'The DEBRIEF With Briahna Joy Gray',
'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm',
'duration': 10043.16,
'series_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7',
'uploader_url': 'http://patreon.com/badfaithpodcast',
'upload_date': '20220826',
'episode': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?',
'display_id': 'episode-',
'series': 'The DEBRIEF With Briahna Joy Gray',
'channel_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7',
'view_count': int,
'uploader': 'Briahna Gray',
'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/461ea0d86172cb6aff7d6c80fd49259cf5e64bdf737a4650f8bc24cf392ca218.png',
'episode_id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c',
'timestamp': 1661476708.282,
}
}] }]
def try_get_user_name(self, d): def try_get_user_name(self, d):
@ -86,6 +130,7 @@ def _real_extract(self, url):
return { return {
'id': id, 'id': id,
'_old_archive_ids': [make_archive_id(self, display_id.rsplit('-', 1)[-1])],
'display_id': display_id, 'display_id': display_id,
'title': title, 'title': title,
'formats': formats, 'formats': formats,

View file

@ -1,9 +1,5 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import int_or_none, url_or_none
ExtractorError,
int_or_none,
url_or_none,
)
class CamModelsIE(InfoExtractor): class CamModelsIE(InfoExtractor):
@ -17,32 +13,11 @@ class CamModelsIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
user_id = self._match_id(url) user_id = self._match_id(url)
webpage = self._download_webpage(
url, user_id, headers=self.geo_verification_headers())
manifest_root = self._html_search_regex(
r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None)
if not manifest_root:
ERRORS = (
("I'm offline, but let's stay connected", 'This user is currently offline'),
('in a private show', 'This user is in a private show'),
('is currently performing LIVE', 'This model is currently performing live'),
)
for pattern, message in ERRORS:
if pattern in webpage:
error = message
expected = True
break
else:
error = 'Unable to find manifest URL root'
expected = False
raise ExtractorError(error, expected=expected)
manifest = self._download_json( manifest = self._download_json(
'%s%s.json' % (manifest_root, user_id), user_id) 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id)
formats = [] formats = []
thumbnails = []
for format_id, format_dict in manifest['formats'].items(): for format_id, format_dict in manifest['formats'].items():
if not isinstance(format_dict, dict): if not isinstance(format_dict, dict):
continue continue
@ -82,12 +57,20 @@ def _real_extract(self, url):
'quality': -10, 'quality': -10,
}) })
else: else:
if format_id == 'jpeg':
thumbnails.append({
'url': f['url'],
'width': f['width'],
'height': f['height'],
'format_id': f['format_id'],
})
continue continue
formats.append(f) formats.append(f)
return { return {
'id': user_id, 'id': user_id,
'title': user_id, 'title': user_id,
'thumbnails': thumbnails,
'is_live': True, 'is_live': True,
'formats': formats, 'formats': formats,
'age_limit': 18 'age_limit': 18

View file

@ -1338,7 +1338,7 @@ def _get_tfa_info(self, note='two-factor verification code'):
# Helper functions for extracting OpenGraph info # Helper functions for extracting OpenGraph info
@staticmethod @staticmethod
def _og_regexes(prop): def _og_regexes(prop):
content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
% {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'}) % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
template = r'<meta[^>]+?%s[^>]+?%s' template = r'<meta[^>]+?%s[^>]+?%s'

View file

@ -1,17 +1,20 @@
import re import re
import urllib.error
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import compat_parse_qs
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import ( from ..utils import (
HEADRequest, ExtractorError,
determine_ext, determine_ext,
error_to_compat_str,
extract_attributes,
int_or_none, int_or_none,
merge_dicts,
parse_iso8601, parse_iso8601,
strip_or_none, strip_or_none,
try_get, traverse_obj,
url_or_none,
urljoin,
) )
@ -20,14 +23,90 @@ def _call_api(self, slug):
return self._download_json( return self._download_json(
'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug)
def _checked_call_api(self, slug):
try:
return self._call_api(slug)
except ExtractorError as e:
if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404:
e.cause.args = e.cause.args or [
e.cause.geturl(), e.cause.getcode(), e.cause.reason]
raise ExtractorError(
'Content not found: expired?', cause=e.cause,
expected=True)
raise
def _extract_video_info(self, video, fatal=True):
video_id = video['videoId']
formats = []
refs = traverse_obj(video, 'refs', expected_type=dict) or {}
m3u8_url = url_or_none(refs.get('m3uUrl'))
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
f4m_url = url_or_none(refs.get('f4mUrl'))
if f4m_url:
formats.extend(self._extract_f4m_formats(
f4m_url, video_id, f4m_id='hds', fatal=False))
for asset in (video.get('assets') or []):
asset_url = url_or_none(asset.get('url'))
if not asset_url:
continue
formats.append({
'url': asset_url,
'tbr': int_or_none(asset.get('bitrate'), 1000),
'fps': int_or_none(asset.get('frame_rate')),
'height': int_or_none(asset.get('height')),
'width': int_or_none(asset.get('width')),
})
mezzanine_url = traverse_obj(
video, ('system', 'mezzanineUrl'), expected_type=url_or_none)
if mezzanine_url:
formats.append({
'ext': determine_ext(mezzanine_url, 'mp4'),
'format_id': 'mezzanine',
'quality': 1,
'url': mezzanine_url,
})
thumbnails = traverse_obj(
video, ('thumbnails', ..., {'url': 'url'}), expected_type=url_or_none)
tags = traverse_obj(
video, ('tags', ..., 'displayName'),
expected_type=lambda x: x.strip() or None)
metadata = traverse_obj(video, 'metadata', expected_type=dict) or {}
title = traverse_obj(
metadata, 'longTitle', 'title', 'name',
expected_type=lambda x: x.strip() or None)
return {
'id': video_id,
'title': title,
'description': strip_or_none(metadata.get('description')),
'timestamp': parse_iso8601(metadata.get('publishDate')),
'duration': int_or_none(metadata.get('duration')),
'thumbnails': thumbnails,
'formats': formats,
'tags': tags,
}
class IGNIE(IGNBaseIE): class IGNIE(IGNBaseIE):
""" """
Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
Some videos of it.ign.com are also supported Some videos of it.ign.com are also supported
""" """
_VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>.+?)'
_VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)' _PLAYLIST_PATH_RE = r'(?:/?\?(?P<filt>[^&#]+))?'
_VALID_URL = (
r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)'
% '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE)))
IE_NAME = 'ign.com' IE_NAME = 'ign.com'
_PAGE_TYPE = 'video' _PAGE_TYPE = 'video'
@ -42,7 +121,13 @@ class IGNIE(IGNBaseIE):
'timestamp': 1370440800, 'timestamp': 1370440800,
'upload_date': '20130605', 'upload_date': '20130605',
'tags': 'count:9', 'tags': 'count:9',
} 'display_id': 'the-last-of-us-review',
'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2014/03/26/lastofusreviewmimig2.jpg',
'duration': 440,
},
'params': {
'nocheckcertificate': True,
},
}, { }, {
'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
'md5': 'f1581a6fe8c5121be5b807684aeac3f6', 'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
@ -54,84 +139,48 @@ class IGNIE(IGNBaseIE):
'timestamp': 1420571160, 'timestamp': 1420571160,
'upload_date': '20150106', 'upload_date': '20150106',
'tags': 'count:4', 'tags': 'count:4',
} },
'skip': '404 Not Found',
}, { }, {
'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
'only_matching': True, 'only_matching': True,
}] }]
@classmethod
def _extract_embed_urls(cls, url, webpage):
grids = re.findall(
r'''(?s)<section\b[^>]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)</section[^>]*>''',
webpage)
return filter(None,
(urljoin(url, m.group('path')) for m in re.finditer(
r'''<a\b[^>]+\bhref\s*=\s*('|")(?P<path>/videos%s)\1'''
% cls._VIDEO_PATH_RE, grids[0] if grids else '')))
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id, filt = self._match_valid_url(url).group('id', 'filt')
video = self._call_api(display_id) if display_id:
video_id = video['videoId'] return self._extract_video(url, display_id)
metadata = video['metadata'] return self._extract_playlist(url, filt or 'all')
title = metadata.get('longTitle') or metadata.get('title') or metadata['name']
formats = [] def _extract_playlist(self, url, display_id):
refs = video.get('refs') or {} webpage = self._download_webpage(url, display_id)
m3u8_url = refs.get('m3uUrl') return self.playlist_result(
if m3u8_url: (self.url_result(u, self.ie_key())
formats.extend(self._extract_m3u8_formats( for u in self._extract_embed_urls(url, webpage)),
m3u8_url, video_id, 'mp4', 'm3u8_native', playlist_id=display_id)
m3u8_id='hls', fatal=False))
f4m_url = refs.get('f4mUrl') def _extract_video(self, url, display_id):
if f4m_url: video = self._checked_call_api(display_id)
formats.extend(self._extract_f4m_formats(
f4m_url, video_id, f4m_id='hds', fatal=False))
for asset in (video.get('assets') or []): info = self._extract_video_info(video)
asset_url = asset.get('url')
if not asset_url:
continue
formats.append({
'url': asset_url,
'tbr': int_or_none(asset.get('bitrate'), 1000),
'fps': int_or_none(asset.get('frame_rate')),
'height': int_or_none(asset.get('height')),
'width': int_or_none(asset.get('width')),
})
mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) return merge_dicts({
if mezzanine_url:
formats.append({
'ext': determine_ext(mezzanine_url, 'mp4'),
'format_id': 'mezzanine',
'quality': 1,
'url': mezzanine_url,
})
thumbnails = []
for thumbnail in (video.get('thumbnails') or []):
thumbnail_url = thumbnail.get('url')
if not thumbnail_url:
continue
thumbnails.append({
'url': thumbnail_url,
})
tags = []
for tag in (video.get('tags') or []):
display_name = tag.get('displayName')
if not display_name:
continue
tags.append(display_name)
return {
'id': video_id,
'title': title,
'description': strip_or_none(metadata.get('description')),
'timestamp': parse_iso8601(metadata.get('publishDate')),
'duration': int_or_none(metadata.get('duration')),
'display_id': display_id, 'display_id': display_id,
'thumbnails': thumbnails, }, info)
'formats': formats,
'tags': tags,
}
class IGNVideoIE(InfoExtractor): class IGNVideoIE(IGNBaseIE):
_VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/' _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/'
_TESTS = [{ _TESTS = [{
'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
@ -143,7 +192,16 @@ class IGNVideoIE(InfoExtractor):
'description': 'Taking out assassination targets in Hitman has never been more stylish.', 'description': 'Taking out assassination targets in Hitman has never been more stylish.',
'timestamp': 1444665600, 'timestamp': 1444665600,
'upload_date': '20151012', 'upload_date': '20151012',
} 'display_id': '112203',
'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg',
'duration': 298,
'tags': 'count:13',
'display_id': '112203',
'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg',
'duration': 298,
'tags': 'count:13',
},
'expected_warnings': ['HTTP Error 400: Bad Request'],
}, { }, {
'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
'only_matching': True, 'only_matching': True,
@ -163,22 +221,38 @@ class IGNVideoIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') parsed_url = urllib.parse.urlparse(url)
url = self._request_webpage(req, video_id).geturl() embed_url = urllib.parse.urlunparse(
parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed'))
webpage, urlh = self._download_webpage_handle(embed_url, video_id)
new_url = urlh.geturl()
ign_url = compat_parse_qs( ign_url = compat_parse_qs(
compat_urllib_parse_urlparse(url).query).get('url', [None])[0] urllib.parse.urlparse(new_url).query).get('url', [None])[-1]
if ign_url: if ign_url:
return self.url_result(ign_url, IGNIE.ie_key()) return self.url_result(ign_url, IGNIE.ie_key())
return self.url_result(url) video = self._search_regex(r'(<div\b[^>]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False)
if not video:
if new_url == url:
raise ExtractorError('Redirect loop: ' + url)
return self.url_result(new_url)
video = extract_attributes(video)
video_data = video.get('data-settings') or '{}'
video_data = self._parse_json(video_data, video_id)['video']
info = self._extract_video_info(video_data)
return merge_dicts({
'display_id': video_id,
}, info)
class IGNArticleIE(IGNBaseIE): class IGNArticleIE(IGNBaseIE):
_VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)' _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P<id>[^/?&#]+)'
_PAGE_TYPE = 'article' _PAGE_TYPE = 'article'
_TESTS = [{ _TESTS = [{
'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
'info_dict': { 'info_dict': {
'id': '524497489e4e8ff5848ece34', 'id': '72113',
'title': '100 Little Things in GTA 5 That Will Blow Your Mind', 'title': '100 Little Things in GTA 5 That Will Blow Your Mind',
}, },
'playlist': [ 'playlist': [
@ -186,34 +260,43 @@ class IGNArticleIE(IGNBaseIE):
'info_dict': { 'info_dict': {
'id': '5ebbd138523268b93c9141af17bec937', 'id': '5ebbd138523268b93c9141af17bec937',
'ext': 'mp4', 'ext': 'mp4',
'title': 'GTA 5 Video Review', 'title': 'Grand Theft Auto V Video Review',
'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
'timestamp': 1379339880, 'timestamp': 1379339880,
'upload_date': '20130916', 'upload_date': '20130916',
'tags': 'count:12',
'thumbnail': 'https://assets1.ignimgs.com/thumbs/userUploaded/2021/8/16/gta-v-heistsjpg-e94705-1629138553533.jpeg',
'display_id': 'grand-theft-auto-v-video-review',
'duration': 501,
}, },
}, },
{ {
'info_dict': { 'info_dict': {
'id': '638672ee848ae4ff108df2a296418ee2', 'id': '638672ee848ae4ff108df2a296418ee2',
'ext': 'mp4', 'ext': 'mp4',
'title': '26 Twisted Moments from GTA 5 in Slow Motion', 'title': 'GTA 5 In Slow Motion',
'description': 'The twisted beauty of GTA 5 in stunning slow motion.', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
'timestamp': 1386878820, 'timestamp': 1386878820,
'upload_date': '20131212', 'upload_date': '20131212',
'duration': 202,
'tags': 'count:25',
'display_id': 'gta-5-in-slow-motion',
'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2013/11/03/GTA-SLO-MO-1.jpg',
}, },
}, },
], ],
'params': { 'params': {
'playlist_items': '2-3',
'skip_download': True, 'skip_download': True,
}, },
'expected_warnings': ['Backend fetch failed'],
}, { }, {
'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
'info_dict': { 'info_dict': {
'id': '53ee806780a81ec46e0790f8', 'id': '53ee806780a81ec46e0790f8',
'title': 'Rewind Theater - Wild Trailer Gamescom 2014', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
}, },
'playlist_count': 2, 'playlist_count': 1,
'expected_warnings': ['Backend fetch failed'],
}, { }, {
# videoId pattern # videoId pattern
'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
@ -236,18 +319,84 @@ class IGNArticleIE(IGNBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
def _checked_call_api(self, slug):
try:
return self._call_api(slug)
except ExtractorError as e:
if isinstance(e.cause, urllib.error.HTTPError):
e.cause.args = e.cause.args or [
e.cause.geturl(), e.cause.getcode(), e.cause.reason]
if e.cause.code == 404:
raise ExtractorError(
'Content not found: expired?', cause=e.cause,
expected=True)
elif e.cause.code == 503:
self.report_warning(error_to_compat_str(e.cause))
return
raise
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
article = self._call_api(display_id) article = self._checked_call_api(display_id)
def entries(): if article:
media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) # obsolete ?
if media_url: def entries():
yield self.url_result(media_url, IGNIE.ie_key()) media_url = traverse_obj(
for content in (article.get('content') or []): article, ('mediaRelations', 0, 'media', 'metadata', 'url'),
for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content): expected_type=url_or_none)
yield self.url_result(video_url) if media_url:
yield self.url_result(media_url, IGNIE.ie_key())
for content in (article.get('content') or []):
for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
if url_or_none(video_url):
yield self.url_result(video_url)
return self.playlist_result(
entries(), article.get('articleId'),
traverse_obj(
article, ('metadata', 'headline'),
expected_type=lambda x: x.strip() or None))
webpage = self._download_webpage(url, display_id)
playlist_id = self._html_search_meta('dable:item_id', webpage, default=None)
if playlist_id:
def entries():
for m in re.finditer(
r'''(?s)<object\b[^>]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P<params>.+?)</object''',
webpage):
flashvars = self._search_regex(
r'''(<param\b[^>]+\bname\s*=\s*("|')flashvars\2[^>]*>)''',
m.group('params'), 'flashvars', default='')
flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '')
v_url = url_or_none((flashvars.get('url') or [None])[-1])
if v_url:
yield self.url_result(v_url)
else:
playlist_id = self._search_regex(
r'''\bdata-post-id\s*=\s*("|')(?P<id>[\da-f]+)\1''',
webpage, 'id', group='id', default=None)
nextjs_data = self._search_nextjs_data(webpage, display_id)
def entries():
for player in traverse_obj(
nextjs_data,
('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')):
# skip promo links (which may not always be served, eg GH CI servers)
if traverse_obj(nextjs_data,
('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')),
expected_type=dict):
continue
video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {}
info = self._extract_video_info(video, fatal=False)
if info:
yield merge_dicts({
'display_id': display_id,
}, info)
return self.playlist_result( return self.playlist_result(
entries(), article.get('articleId'), entries(), playlist_id or display_id,
strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None)

View file

@ -0,0 +1,31 @@
from .common import InfoExtractor
from ..utils import update_url
class KommunetvIE(InfoExtractor):
_VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P<id>\w+)'
_TEST = {
'url': 'https://oslo.kommunetv.no/archive/921',
'md5': '5f102be308ee759be1e12b63d5da4bbc',
'info_dict': {
'id': '921',
'title': 'Bystyremøte',
'ext': 'mp4'
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
headers = {
'Accept': 'application/json'
}
data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers)
title = data['stream']['title']
file = data['playlist'][0]['playlist'][0]['file']
url = update_url(file, query=None, fragment=None)
formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
return {
'id': video_id,
'formats': formats,
'title': title
}

View file

@ -1,5 +1,16 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import js_to_json from ..utils import (
MONTH_NAMES,
clean_html,
get_element_by_class,
get_element_by_id,
int_or_none,
js_to_json,
qualities,
unified_strdate,
)
class MyVideoGeIE(InfoExtractor): class MyVideoGeIE(InfoExtractor):
@ -11,37 +22,50 @@ class MyVideoGeIE(InfoExtractor):
'id': '3941048', 'id': '3941048',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The best prikol', 'title': 'The best prikol',
'upload_date': '20200611',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'md5:d72addd357b0dd914e704781f7f777d8', 'uploader': 'chixa33',
'description': 'md5:5c0371f540f5888d603ebfedd46b6df3' 'description': 'md5:5b067801318e33c2e6eea4ab90b1fdd3',
} },
} }
_MONTH_NAMES_KA = ['იანვარი', 'თებერვალი', 'მარტი', 'აპრილი', 'მაისი', 'ივნისი', 'ივლისი', 'აგვისტო', 'სექტემბერი', 'ოქტომბერი', 'ნოემბერი', 'დეკემბერი']
_quality = staticmethod(qualities(('SD', 'HD')))
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title') title = (
description = self._og_search_description(webpage) self._og_search_title(webpage, default=None)
thumbnail = self._html_search_meta(['og:image'], webpage) or clean_html(get_element_by_class('my_video_title', webpage))
uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title\b', webpage, 'title'))
jwplayer_sources = self._parse_json( jwplayer_sources = self._parse_json(
self._search_regex( self._search_regex(
r"(?s)jwplayer\(\"mvplayer\"\).setup\(.*?sources: (.*?])", webpage, 'jwplayer sources'), r'''(?s)jwplayer\s*\(\s*['"]mvplayer['"]\s*\)\s*\.\s*setup\s*\(.*?\bsources\s*:\s*(\[.*?])\s*[,});]''', webpage, 'jwplayer sources', fatal=False)
video_id, transform_source=js_to_json) or '',
video_id, transform_source=js_to_json, fatal=False)
def _formats_key(f): formats = self._parse_jwplayer_formats(jwplayer_sources or [], video_id)
if f['label'] == 'SD': for f in formats or []:
return -1 f['quality'] = self._quality(f['format_id'])
elif f['label'] == 'HD':
return 1
else:
return 0
jwplayer_sources = sorted(jwplayer_sources, key=_formats_key) description = (
self._og_search_description(webpage)
or get_element_by_id('long_desc_holder', webpage)
or self._html_search_meta('description', webpage))
formats = self._parse_jwplayer_formats(jwplayer_sources, video_id) uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False)
upload_date = get_element_by_class('mv_vid_upl_date', webpage)
# as ka locale may not be present roll a local date conversion
upload_date = (unified_strdate(
# translate any ka month to an en one
re.sub('|'.join(self._MONTH_NAMES_KA),
lambda m: MONTH_NAMES['en'][self._MONTH_NAMES_KA.index(m.group(0))],
upload_date, re.I))
if upload_date else None)
return { return {
'id': video_id, 'id': video_id,
@ -49,5 +73,9 @@ def _formats_key(f):
'description': description, 'description': description,
'uploader': uploader, 'uploader': uploader,
'formats': formats, 'formats': formats,
'thumbnail': thumbnail 'thumbnail': self._og_search_thumbnail(webpage),
'upload_date': upload_date,
'view_count': int_or_none(get_element_by_class('mv_vid_views', webpage)),
'like_count': int_or_none(get_element_by_id('likes_count', webpage)),
'dislike_count': int_or_none(get_element_by_id('dislikes_count', webpage)),
} }

View file

@ -0,0 +1,97 @@
import re
from .common import InfoExtractor
from ..utils import merge_dicts
class Pr0grammStaticIE(InfoExtractor):
# Possible urls:
# https://pr0gramm.com/static/5466437
_VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)'
_TEST = {
'url': 'https://pr0gramm.com/static/5466437',
'md5': '52fa540d70d3edc286846f8ca85938aa',
'info_dict': {
'id': '5466437',
'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st',
'uploader': 'g11st',
'upload_date': '20221221',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
# Fetch media sources
entries = self._parse_html5_media_entries(url, webpage, video_id)
media_info = entries[0]
# Fetch author
uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader')
# Fetch approx upload timestamp from filename
# Have None-defaults in case the extraction fails
uploadDay = None
uploadMon = None
uploadYear = None
uploadTimestr = None
# (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4)
m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage)
if (m):
# Up to a day of accuracy should suffice...
uploadDay = m.groupdict().get('day')
uploadMon = m.groupdict().get('mon')
uploadYear = m.groupdict().get('year')
uploadTimestr = uploadYear + uploadMon + uploadDay
return merge_dicts({
'id': video_id,
'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''),
'uploader': uploader,
'upload_date': uploadTimestr
}, media_info)
# This extractor is for the primary url (used for sharing, and appears in the
# location bar) Since this page loads the DOM via JS, yt-dl can't find any
# video information here. So let's redirect to a compatibility version of
# the site, which does contain the <video>-element by itself, without requiring
# js to be ran.
class Pr0grammIE(InfoExtractor):
# Possible urls:
# https://pr0gramm.com/new/546637
# https://pr0gramm.com/new/video/546637
# https://pr0gramm.com/top/546637
# https://pr0gramm.com/top/video/546637
# https://pr0gramm.com/user/g11st/uploads/5466437
# https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290
# https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030
# https://pr0gramm.com/user/froschler/1elf/5232030
# https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id!
# https://pr0gramm.com/top/fruher war alles damals/5498175
_VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)'
_TEST = {
'url': 'https://pr0gramm.com/new/video/5466437',
'info_dict': {
'id': '5466437',
'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st',
'uploader': 'g11st',
'upload_date': '20221221',
}
}
def _generic_title():
return "oof"
def _real_extract(self, url):
video_id = self._match_id(url)
return self.url_result(
'https://pr0gramm.com/static/' + video_id,
video_id=video_id,
ie=Pr0grammStaticIE.ie_key())

View file

@ -0,0 +1,93 @@
import re
from .common import InfoExtractor
class RbgTumIE(InfoExtractor):
_VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)'
_TESTS = [{
# Combined view
'url': 'https://live.rbg.tum.de/w/cpp/22128',
'md5': '53a5e7b3e07128e33bbf36687fe1c08f',
'info_dict': {
'id': 'cpp/22128',
'ext': 'mp4',
'title': 'Lecture: October 18. 2022',
'series': 'Concepts of C++ programming (IN2377)',
}
}, {
# Presentation only
'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES',
'md5': '36c584272179f3e56b0db5d880639cba',
'info_dict': {
'id': 'I2DL/12349/PRES',
'ext': 'mp4',
'title': 'Lecture 3: Introduction to Neural Networks',
'series': 'Introduction to Deep Learning (IN2346)',
}
}, {
# Camera only
'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM',
'md5': 'e04189d92ff2f56aedf5cede65d37aad',
'info_dict': {
'id': 'fvv-info/16130/CAM',
'ext': 'mp4',
'title': 'Fachschaftsvollversammlung',
'series': 'Fachschaftsvollversammlung Informatik',
}
}, ]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8')
lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
lecture_series_title = self._html_search_regex(
r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series')
formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
return {
'id': video_id,
'title': lecture_title,
'series': lecture_series_title,
'formats': formats,
}
class RbgTumCourseIE(InfoExtractor):
_VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)'
_TESTS = [{
'url': 'https://live.rbg.tum.de/course/2022/S/fpv',
'info_dict': {
'title': 'Funktionale Programmierung und Verifikation (IN0003)',
'id': '2022/S/fpv',
},
'params': {
'noplaylist': False,
},
'playlist_count': 13,
}, {
'url': 'https://live.rbg.tum.de/course/2022/W/set',
'info_dict': {
'title': 'SET FSMPIC',
'id': '2022/W/set',
},
'params': {
'noplaylist': False,
},
'playlist_count': 6,
}, ]
def _real_extract(self, url):
course_id = self._match_id(url)
webpage = self._download_webpage(url, course_id)
lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
lecture_urls = []
for lecture_url in re.findall(r'(?i)href="/w/(.+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage):
lecture_urls.append(self.url_result('https://live.rbg.tum.de/w/' + lecture_url, ie=RbgTumIE.ie_key()))
return self.playlist_result(lecture_urls, course_id, lecture_series_title)

View file

@ -130,6 +130,9 @@ class KnownPiracyIE(UnsupportedInfoExtractor):
URLS = ( URLS = (
r'dood\.(?:to|watch|so|pm|wf|re)', r'dood\.(?:to|watch|so|pm|wf|re)',
# Sites youtube-dl supports, but we won't
r'https://viewsb\.com',
r'https://filemoon\.sx',
) )
_TESTS = [{ _TESTS = [{

View file

@ -304,27 +304,33 @@ class VimeoIE(VimeoBaseInfoExtractor):
# _VALID_URL matches Vimeo URLs # _VALID_URL matches Vimeo URLs
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?: (?:
(?: (?:
www| www|
player player
) )
\. \.
)? )?
vimeo\.com/ vimeo\.com/
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:
(?:[^/]+/)*? (?P<u>user)|
(?: (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
(?: (?:.*?/)??
play_redirect_hls| (?P<q>
moogaloop\.swf)\?clip_id= (?:
)? play_redirect_hls|
(?:videos?/)? moogaloop\.swf)\?clip_id=
(?P<id>[0-9]+) )?
(?:/(?P<unlisted_hash>[\da-f]{10}))? (?:videos?/)?
/?(?:[?&].*)?(?:[#].*)?$ )
''' (?P<id>[0-9]+)
(?(u)
/(?!videos|likes)[^/?#]+/?|
(?(q)|/(?P<unlisted_hash>[\da-f]{10}))?
)
(?:(?(q)[&]|(?(u)|/?)[?]).*?)?(?:[#].*)?$
'''
IE_NAME = 'vimeo' IE_NAME = 'vimeo'
_EMBED_REGEX = [ _EMBED_REGEX = [
# iframe # iframe
@ -705,7 +711,12 @@ class VimeoIE(VimeoBaseInfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
} },
{
# user playlist alias -> https://vimeo.com/258705797
'url': 'https://vimeo.com/user26785108/newspiritualguide',
'only_matching': True,
},
# https://gettingthingsdone.com/workflowmap/ # https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header # vimeo embed with check-password page protected by Referer header
] ]

View file

@ -21,7 +21,7 @@
class XHamsterIE(InfoExtractor): class XHamsterIE(InfoExtractor):
_DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com)' _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)'
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?:.+?\.)?%s/ (?:.+?\.)?%s/
@ -120,6 +120,9 @@ class XHamsterIE(InfoExtractor):
}, { }, {
'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf', 'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://xhvid.com/videos/lk-mm-xhc6wn6',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -422,6 +425,9 @@ class XHamsterUserIE(InfoExtractor):
}, { }, {
'url': 'https://xhday.com/users/mobhunter', 'url': 'https://xhday.com/users/mobhunter',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://xhvid.com/users/pelushe21',
'only_matching': True,
}] }]
def _entries(self, user_id): def _entries(self, user_id):

View file

@ -3149,14 +3149,28 @@ def urlencode_postdata(*args, **kargs):
return urllib.parse.urlencode(*args, **kargs).encode('ascii') return urllib.parse.urlencode(*args, **kargs).encode('ascii')
def update_url(url, *, query_update=None, **kwargs):
"""Replace URL components specified by kwargs
@param url str or parse url tuple
@param query_update update query
@returns str
"""
if isinstance(url, str):
if not kwargs and not query_update:
return url
else:
url = urllib.parse.urlparse(url)
if query_update:
assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
kwargs['query'] = urllib.parse.urlencode({
**urllib.parse.parse_qs(url.query),
**query_update
}, True)
return urllib.parse.urlunparse(url._replace(**kwargs))
def update_url_query(url, query): def update_url_query(url, query):
if not query: return update_url(url, query_update=query)
return url
parsed_url = urllib.parse.urlparse(url)
qs = urllib.parse.parse_qs(parsed_url.query)
qs.update(query)
return urllib.parse.urlunparse(parsed_url._replace(
query=urllib.parse.urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers=None, query=None): def update_Request(req, url=None, data=None, headers=None, query=None):