mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-10 09:07:58 +01:00
[extractor] Extract chapters from JSON-LD (#2031)
Authored by: iw0nderhow, pukkandan
This commit is contained in:
parent
7592749cbe
commit
f522573787
2 changed files with 77 additions and 0 deletions
|
@ -208,6 +208,65 @@ def test_search_json_ld_realworld(self):
|
||||||
},
|
},
|
||||||
{'expected_type': 'NewsArticle'},
|
{'expected_type': 'NewsArticle'},
|
||||||
),
|
),
|
||||||
|
(
|
||||||
|
r'''<script type="application/ld+json">
|
||||||
|
{"url":"/vrtnu/a-z/het-journaal/2021/het-journaal-het-journaal-19u-20211231/",
|
||||||
|
"name":"Het journaal 19u",
|
||||||
|
"description":"Het journaal 19u van vrijdag 31 december 2021.",
|
||||||
|
"potentialAction":{"url":"https://vrtnu.page.link/pfVy6ihgCAJKgHqe8","@type":"ShareAction"},
|
||||||
|
"mainEntityOfPage":{"@id":"1640092242445","@type":"WebPage"},
|
||||||
|
"publication":[{
|
||||||
|
"startDate":"2021-12-31T19:00:00.000+01:00",
|
||||||
|
"endDate":"2022-01-30T23:55:00.000+01:00",
|
||||||
|
"publishedBy":{"name":"een","@type":"Organization"},
|
||||||
|
"publishedOn":{"url":"https://www.vrt.be/vrtnu/","name":"VRT NU","@type":"BroadcastService"},
|
||||||
|
"@id":"pbs-pub-3a7ec233-da95-4c1e-9b2b-cf5fdfebcbe8",
|
||||||
|
"@type":"BroadcastEvent"
|
||||||
|
}],
|
||||||
|
"video":{
|
||||||
|
"name":"Het journaal - Aflevering 365 (Seizoen 2021)",
|
||||||
|
"description":"Het journaal 19u van vrijdag 31 december 2021. Bekijk aflevering 365 van seizoen 2021 met VRT NU via de site of app.",
|
||||||
|
"thumbnailUrl":"//images.vrt.be/width1280/2021/12/31/80d5ed00-6a64-11ec-b07d-02b7b76bf47f.jpg",
|
||||||
|
"expires":"2022-01-30T23:55:00.000+01:00",
|
||||||
|
"hasPart":[
|
||||||
|
{"name":"Explosie Turnhout","startOffset":70,"@type":"Clip"},
|
||||||
|
{"name":"Jaarwisseling","startOffset":440,"@type":"Clip"},
|
||||||
|
{"name":"Natuurbranden Colorado","startOffset":1179,"@type":"Clip"},
|
||||||
|
{"name":"Klimaatverandering","startOffset":1263,"@type":"Clip"},
|
||||||
|
{"name":"Zacht weer","startOffset":1367,"@type":"Clip"},
|
||||||
|
{"name":"Financiële balans","startOffset":1383,"@type":"Clip"},
|
||||||
|
{"name":"Club Brugge","startOffset":1484,"@type":"Clip"},
|
||||||
|
{"name":"Mentale gezondheid bij topsporters","startOffset":1575,"@type":"Clip"},
|
||||||
|
{"name":"Olympische Winterspelen","startOffset":1728,"@type":"Clip"},
|
||||||
|
{"name":"Sober oudjaar in Nederland","startOffset":1873,"@type":"Clip"}
|
||||||
|
],
|
||||||
|
"duration":"PT34M39.23S",
|
||||||
|
"uploadDate":"2021-12-31T19:00:00.000+01:00",
|
||||||
|
"@id":"vid-9457d0c6-b8ac-4aba-b5e1-15aa3a3295b5",
|
||||||
|
"@type":"VideoObject"
|
||||||
|
},
|
||||||
|
"genre":["Nieuws en actua"],
|
||||||
|
"episodeNumber":365,
|
||||||
|
"partOfSeries":{"name":"Het journaal","@id":"222831405527","@type":"TVSeries"},
|
||||||
|
"partOfSeason":{"name":"Seizoen 2021","@id":"961809365527","@type":"TVSeason"},
|
||||||
|
"@context":"https://schema.org","@id":"961685295527","@type":"TVEpisode"}</script>
|
||||||
|
''',
|
||||||
|
{
|
||||||
|
'chapters': [
|
||||||
|
{"title": "Explosie Turnhout", "start_time": 70, "end_time": 440},
|
||||||
|
{"title": "Jaarwisseling", "start_time": 440, "end_time": 1179},
|
||||||
|
{"title": "Natuurbranden Colorado", "start_time": 1179, "end_time": 1263},
|
||||||
|
{"title": "Klimaatverandering", "start_time": 1263, "end_time": 1367},
|
||||||
|
{"title": "Zacht weer", "start_time": 1367, "end_time": 1383},
|
||||||
|
{"title": "Financiële balans", "start_time": 1383, "end_time": 1484},
|
||||||
|
{"title": "Club Brugge", "start_time": 1484, "end_time": 1575},
|
||||||
|
{"title": "Mentale gezondheid bij topsporters", "start_time": 1575, "end_time": 1728},
|
||||||
|
{"title": "Olympische Winterspelen", "start_time": 1728, "end_time": 1873},
|
||||||
|
{"title": "Sober oudjaar in Nederland", "start_time": 1873, "end_time": 2079.23}
|
||||||
|
],
|
||||||
|
'title': 'Het journaal - Aflevering 365 (Seizoen 2021)'
|
||||||
|
}, {}
|
||||||
|
),
|
||||||
(
|
(
|
||||||
# test multiple thumbnails in a list
|
# test multiple thumbnails in a list
|
||||||
r'''
|
r'''
|
||||||
|
|
|
@ -1429,6 +1429,23 @@ def extract_interaction_statistic(e):
|
||||||
continue
|
continue
|
||||||
info[count_key] = interaction_count
|
info[count_key] = interaction_count
|
||||||
|
|
||||||
|
def extract_chapter_information(e):
    """Populate ``info['chapters']`` from the ``Clip`` entries of ``e['hasPart']``.

    Each clip's ``name``, ``startOffset`` and ``endOffset`` become the
    chapter's ``title``, ``start_time`` and ``end_time``. Missing bounds
    are filled from the neighbouring chapters: a missing end is the next
    chapter's start, a missing start is the previous chapter's end (``0``
    for the first chapter). The final chapter's missing end falls back to
    ``info.get('duration')`` and stays ``None`` when no duration is known.

    ``info`` and ``self`` are taken from the enclosing scope. Nothing is
    written to ``info`` when there are no clips, or when any chapter other
    than the last still has a ``None`` field after filling (a warning is
    reported in that case).

    :param e: JSON-LD object (a dict) that may carry a ``hasPart`` entry.
    :return: ``None``; the result is stored in ``info['chapters']``.
    """
    # JSON-LD permits a single object where an array is expected, and the
    # key may be present with a null value; normalize both to a list.
    parts = e.get('hasPart') or []
    if isinstance(parts, dict):
        parts = [parts]

    chapters = [{
        'title': part.get('name'),
        'start_time': part.get('startOffset'),
        'end_time': part.get('endOffset'),
    } for part in parts if isinstance(part, dict) and part.get('@type') == 'Clip']
    if not chapters:
        return

    # Walk every chapter that has a successor; the last one is handled
    # separately because its end comes from the overall duration.
    previous_end = 0
    for idx, (chapter, following) in enumerate(zip(chapters, chapters[1:])):
        chapter['end_time'] = chapter['end_time'] or following['start_time']
        chapter['start_time'] = chapter['start_time'] or previous_end
        if None in chapter.values():
            self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
            return
        previous_end = chapter['end_time']

    final = chapters[-1]
    # The last chapter also needs its start back-filled (it is the only
    # chapter when there is exactly one clip).
    final['start_time'] = final['start_time'] or previous_end
    # .get(): the duration may not have been extracted at all.
    final['end_time'] = final['end_time'] or info.get('duration')
    info['chapters'] = chapters
|
||||||
|
|
||||||
def extract_video_object(e):
|
def extract_video_object(e):
|
||||||
assert e['@type'] == 'VideoObject'
|
assert e['@type'] == 'VideoObject'
|
||||||
author = e.get('author')
|
author = e.get('author')
|
||||||
|
@ -1452,6 +1469,7 @@ def extract_video_object(e):
|
||||||
'view_count': int_or_none(e.get('interactionCount')),
|
'view_count': int_or_none(e.get('interactionCount')),
|
||||||
})
|
})
|
||||||
extract_interaction_statistic(e)
|
extract_interaction_statistic(e)
|
||||||
|
extract_chapter_information(e)
|
||||||
|
|
||||||
def traverse_json_ld(json_ld, at_top_level=True):
|
def traverse_json_ld(json_ld, at_top_level=True):
|
||||||
for e in json_ld:
|
for e in json_ld:
|
||||||
|
|
Loading…
Reference in a new issue