From 06167fbbd3c407ab77e2c7f5031d1ec93886946f Mon Sep 17 00:00:00 2001
From: pukkandan
Date: Wed, 27 Jan 2021 20:32:51 +0530
Subject: [PATCH] #31 Features from animelover1984/youtube-dl

* Add `--get-comments`
* [youtube] Extract comments
* [bilibili] Add BiliBiliSearchIE, BilibiliChannelIE
* [bilibili] Extract comments
* [bilibili] Better video extraction
* Write playlist data to infojson
* [FFmpegMetadata] Embed infojson inside the video
* [EmbedThumbnail] Try embedding in mp4 using ffprobe and `-disposition`
* [EmbedThumbnail] Treat mka like mkv and mov like mp4
* [EmbedThumbnail] Embed in ogg/opus
* [VideoRemuxer] Conditionally remux video
* [VideoRemuxer] Add `-movflags +faststart` when remuxing from mp4
* [ffmpeg] Print entire stderr in verbose when there is an error
* [EmbedSubtitle] Warn when embedding ass in mp4
* [anvato] Use NFLTokenGenerator if possible
---
 README.md                                   |   7 +-
 requirements.txt                            |   1 +
 youtube_dlc/YoutubeDL.py                    |  20 +-
 youtube_dlc/__init__.py                     |   3 +-
 youtube_dlc/extractor/anvato.py             |  14 +-
 youtube_dlc/extractor/bilibili.py           | 209 +++++++++++++++++++-
 youtube_dlc/extractor/extractors.py         |   2 +
 youtube_dlc/extractor/youtube.py            | 160 ++++++++++++++-
 youtube_dlc/options.py                      |   8 +-
 youtube_dlc/postprocessor/embedthumbnail.py | 118 ++++++++---
 youtube_dlc/postprocessor/ffmpeg.py         |  98 +++++++--
 youtube_dlc/utils.py                        |  11 ++
 12 files changed, 583 insertions(+), 68 deletions(-)
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
index f65d4b00f..d21093f22 100644
--- a/README.md
+++ b/README.md
@@ -375,6 +375,8 @@ ## Filesystem Options:
     --write-annotations              Write video annotations to a
                                      .annotations.xml file
     --no-write-annotations           Do not write video annotations (default)
+    --get-comments                   Retrieve video comments to be placed in the
+                                     .info.json file
     --load-info-json FILE            JSON file containing the video information
                                      (created with the "--write-info-json"
                                      option)
@@ -575,7 +577,10 @@ ## Post-Processing Options:
     --remux-video FORMAT             Remux the video into another container if
                                      necessary (currently supported: mp4|mkv).
                                      If target container does not support the
-                                     video/audio codec, remuxing will fail
+                                     video/audio codec, remuxing will fail. You
+                                     can specify multiple rules; e.g.
+                                     "aac>m4a/mov>mp4/mkv" will remux aac to
+                                     m4a, mov to mp4 and anything else to mkv.
     --recode-video FORMAT            Re-encode the video into another format if
                                      re-encoding is necessary (currently
                                      supported: mp4|flv|ogg|webm|mkv|avi)
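
For reference, a remux rule string resolves to a target container roughly as sketched below. This mirrors the logic that FFmpegVideoRemuxerPP.run gains later in this patch; the helper name resolve_remux_target is illustrative only, not something the patch adds:

    def resolve_remux_target(rules, source_ext):
        # rules, e.g. 'aac>m4a/mov>mp4/mkv', are '/'-separated; each rule is
        # either 'src>dst' (remux src into dst) or a bare 'dst' catch-all.
        for rule in rules.lower().split('/'):
            src, _, dst = rule.rpartition('>')
            if not src or src.strip() == source_ext:
                return dst.strip()
        return None  # no rule matched: the file keeps its container

    assert resolve_remux_target('aac>m4a/mov>mp4/mkv', 'aac') == 'm4a'
    assert resolve_remux_target('aac>m4a/mov>mp4/mkv', 'avi') == 'mkv'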
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..26ced3f58
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+mutagen
diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py
index 41386a778..bf57d4765 100644
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@@ -202,6 +202,8 @@ class YoutubeDL(object):
     logtostderr:       Log messages to stderr instead of stdout.
     writedescription: Write the video description to a .description file
     writeinfojson:     Write the video description to a .info.json file
+    writecomments:     Extract video comments. This will not be written to disk
+                       unless writeinfojson is also given
     writeannotations:  Write the video annotations to a .annotations.xml file
     writethumbnail:    Write the thumbnail image to a file
     write_all_thumbnails:  Write all thumbnail formats to files
@@ -930,9 +932,7 @@ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_in
                     self.to_screen("[%s] %s: has already been recorded in archive" % (
                         ie_key, temp_id))
                     break
-            return self.__extract_info(url, ie, download, extra_info, process, info_dict)
-        else:
             self.report_error('no suitable InfoExtractor for URL %s' % url)
@@ -1101,6 +1101,21 @@ def __process_playlist(self, ie_result, download):
         playlist = ie_result.get('title') or ie_result.get('id')
         self.to_screen('[download] Downloading playlist: %s' % playlist)

+        if self.params.get('writeinfojson', False):
+            infofn = replace_extension(
+                self.prepare_filepath(self.prepare_filename(ie_result), 'infojson'),
+                'info.json', ie_result.get('ext'))
+            if self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)):
+                self.to_screen('[info] Playlist description metadata is already present')
+            else:
+                self.to_screen('[info] Writing playlist description metadata as JSON to: ' + infofn)
+                playlist_info = dict(ie_result)
+                playlist_info.pop('entries')
+                try:
+                    write_json_file(self.filter_requested_info(playlist_info), infofn)
+                except (OSError, IOError):
+                    self.report_error('Cannot write playlist description metadata to JSON file ' + infofn)
+
         playlist_results = []

         playliststart = self.params.get('playliststart', 1) - 1
@@ -2105,6 +2120,7 @@ def dl(name, info, subtitle=False):
                 except (OSError, IOError):
                     self.report_error('Cannot write metadata to JSON file ' + infofn)
                     return
+                info_dict['__infojson_filepath'] = infofn

         thumbdir = os.path.dirname(self.prepare_filepath(filename, 'thumbnail'))
         for thumbfn in self._write_thumbnails(info_dict, temp_filename):
diff --git a/youtube_dlc/__init__.py b/youtube_dlc/__init__.py
index 5f97b51ff..6a790339d 100644
--- a/youtube_dlc/__init__.py
+++ b/youtube_dlc/__init__.py
@@ -413,7 +413,8 @@ def parse_retries(retries):
         'updatetime': opts.updatetime,
         'writedescription': opts.writedescription,
         'writeannotations': opts.writeannotations,
-        'writeinfojson': opts.writeinfojson,
+        'writeinfojson': opts.writeinfojson or opts.getcomments,
+        'getcomments': opts.getcomments,
         'writethumbnail': opts.writethumbnail,
         'write_all_thumbnails': opts.write_all_thumbnails,
         'writelink': opts.writelink,
diff --git a/youtube_dlc/extractor/anvato.py b/youtube_dlc/extractor/anvato.py
index b7398563b..a6410311c 100644
--- a/youtube_dlc/extractor/anvato.py
+++ b/youtube_dlc/extractor/anvato.py
@@ -9,6 +9,7 @@
 import time

 from .common import InfoExtractor
+from .anvato_token_generator import NFLTokenGenerator
 from ..aes import aes_encrypt
 from ..compat import compat_str
 from ..utils import (
@@ -203,6 +204,10 @@ class AnvatoIE(InfoExtractor):
         'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
     }

+    _TOKEN_GENERATORS = {
+        'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator,
+    }
+
     _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'

     _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
@@ -262,9 +267,12 @@ def _get_video_json(self, access_key, video_id):
             'anvrid': anvrid,
             'anvts': server_time,
         }
-        api['anvstk'] = md5_text('%s|%s|%d|%s' % (
-            access_key, anvrid, server_time,
-            self._ANVACK_TABLE.get(access_key, self._API_KEY)))
+        if access_key in self._TOKEN_GENERATORS:
+            api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id)
+        else:
+            api['anvstk'] = md5_text('%s|%s|%d|%s' % (
+                access_key, anvrid, server_time,
+                self._ANVACK_TABLE.get(access_key, self._API_KEY)))

         return self._download_json(
             video_data_url, video_id, transform_source=strip_jsonp,
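
The token-generator hook lets an access key swap the static `anvstk` checksum for a dynamically generated `anvstk2` token. A generator only needs to expose the `generate(ie, access_key, video_id)` classmethod that `_get_video_json` calls above; a minimal sketch (the `DummyTokenGenerator` class and its token format are invented for illustration, not part of this patch):

    class DummyTokenGenerator(object):
        @classmethod
        def generate(cls, ie, access_key, video_id):
            # A real implementation (e.g. NFLTokenGenerator) would call the
            # provider's auth endpoint here; `ie` is passed in so it can reuse
            # InfoExtractor helpers such as ie._download_json().
            return 'dummy-token-for-%s' % video_id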
diff --git a/youtube_dlc/extractor/bilibili.py b/youtube_dlc/extractor/bilibili.py
index d39ee8ffe..d8a4a224f 100644
--- a/youtube_dlc/extractor/bilibili.py
+++ b/youtube_dlc/extractor/bilibili.py
@@ -2,9 +2,10 @@ from __future__ import unicode_literals

 import hashlib
+import json
 import re

-from .common import InfoExtractor
+from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
     compat_parse_qs,
     compat_urlparse,
@@ -32,13 +33,14 @@ class BiliBiliIE(InfoExtractor):
                         (?:
                             video/[aA][vV]|
                             anime/(?P<anime_id>\d+)/play\#
-                        )(?P<id>\d+)|
-                        video/[bB][vV](?P<id_bv>[^/?#&]+)
+                        )(?P<id>\d+)|
+                        video/[bB][vV](?P<id_bv>[^/?#&]+)
+                    )
+                    (?:/?\?p=(?P<page>\d+))?
                 '''

     _TESTS = [{
-        'url': 'http://www.bilibili.tv/video/av1074402/',
+        'url': 'http://www.bilibili.com/video/av1074402/',
         'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
         'info_dict': {
             'id': '1074402',
@@ -56,6 +58,10 @@ class BiliBiliIE(InfoExtractor):
         # Tested in BiliBiliBangumiIE
         'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
         'only_matching': True,
+    }, {
+        # bilibili.tv
+        'url': 'http://www.bilibili.tv/video/av1074402/',
+        'only_matching': True,
     }, {
         'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
         'md5': '3f721ad1e75030cc06faf73587cfec57',
@@ -124,12 +130,20 @@ def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})

         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id') or mobj.group('id_bv')
+        video_id = mobj.group('id_bv') or mobj.group('id')
+
+        av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None)
+        video_id = av_id
+
         anime_id = mobj.group('anime_id')
+        page_id = mobj.group('page')
         webpage = self._download_webpage(url, video_id)

         if 'anime/' not in url:
             cid = self._search_regex(
+                r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid',
+                default=None
+            ) or self._search_regex(
                 r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
                 default=None
             ) or compat_parse_qs(self._search_regex(
@@ -207,9 +221,9 @@ def _real_extract(self, url):
             break

         title = self._html_search_regex(
-            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
-             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
-            group='title')
+            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+            group='title') + ('_p' + str(page_id) if page_id is not None else '')
         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
@@ -219,7 +233,8 @@ def _real_extract(self, url):
         # TODO 'view_count' requires deobfuscating Javascript

         info = {
-            'id': video_id,
+            'id': str(video_id) if page_id is None else '%s_p%s' % (video_id, page_id),
+            'cid': cid,
             'title': title,
             'description': description,
             'timestamp': timestamp,
@@ -235,27 +250,134 @@ def _real_extract(self, url):
                 'uploader': uploader_mobj.group('name'),
                 'uploader_id': uploader_mobj.group('id'),
             })
+
         if not info.get('uploader'):
             info['uploader'] = self._html_search_meta(
                 'author', webpage, 'uploader', default=None)

+        comments = None
+        if self._downloader.params.get('getcomments', False):
+            comments = self._get_all_comment_pages(video_id)
+
+        raw_danmaku = self._get_raw_danmaku(video_id, cid)
+
+        raw_tags = self._get_tags(video_id)
+        tags = list(map(lambda x: x['tag_name'], raw_tags))
+
+        top_level_info = {
+            'raw_danmaku': raw_danmaku,
+            'comments': comments,
+            'comment_count': len(comments) if comments is not None else None,
+            'tags': tags,
+            'raw_tags': raw_tags,
+        }
+
+        '''
+        # Requires https://github.com/m13253/danmaku2ass which is licensed under GPL3
+        # See https://github.com/animelover1984/youtube-dl
+        danmaku = NiconicoIE.CreateDanmaku(raw_danmaku, commentType='Bilibili', x=1024, y=576)
+        entries[0]['subtitles'] = {
+            'danmaku': [{
+                'ext': 'ass',
+                'data': danmaku
+            }]
+        }
+        '''
+
         for entry in entries:
             entry.update(info)

         if len(entries) == 1:
+            entries[0].update(top_level_info)
             return entries[0]
         else:
             for idx, entry in enumerate(entries):
                 entry['id'] = '%s_part%d' % (video_id, (idx + 1))

-            return {
+            global_info = {
                 '_type': 'multi_video',
                 'id': video_id,
+                'bv_id': bv_id,
                 'title': title,
                 'description': description,
                 'entries': entries,
             }

+            global_info.update(info)
+            global_info.update(top_level_info)
+
+            return global_info
+
+    def _get_video_id_set(self, id, is_bv):
+        query = {'bvid': id} if is_bv else {'aid': id}
+        response = self._download_json(
+            "http://api.bilibili.cn/x/web-interface/view",
+            id, query=query,
+            note='Grabbing original ID via API')
+
+        if response['code'] == -400:
+            raise ExtractorError('Video ID does not exist', expected=True, video_id=id)
+        elif response['code'] != 0:
+            raise ExtractorError('Unknown error occurred during API check (code %s)' % response['code'], expected=True, video_id=id)
+        return (response['data']['aid'], response['data']['bvid'])
+
+    # Recursive solution to getting every page of comments for the video;
+    # we can stop when we reach a page without any comments.
+    def _get_all_comment_pages(self, video_id, commentPageNumber=0):
+        comment_url = "https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=%s&type=1&oid=%s&sort=2&_=1567227301685" % (commentPageNumber, video_id)
+        json_str = self._download_webpage(
+            comment_url, video_id,
+            note='Extracting comments from page %s' % (commentPageNumber))
+        replies = json.loads(json_str)['data']['replies']
+        if replies is None:
+            return []
+        return self._get_all_children(replies) + self._get_all_comment_pages(video_id, commentPageNumber + 1)
+
+    # Extracts all comments in the tree
+    def _get_all_children(self, replies):
+        if replies is None:
+            return []
+
+        ret = []
+        for reply in replies:
+            author = reply['member']['uname']
+            author_id = reply['member']['mid']
+            id = reply['rpid']
+            text = reply['content']['message']
+            timestamp = reply['ctime']
+            parent = reply['parent'] if reply['parent'] != 0 else 'root'
+
+            comment = {
+                "author": author,
+                "author_id": author_id,
+                "id": id,
+                "text": text,
+                "timestamp": timestamp,
+                "parent": parent,
+            }
+            ret.append(comment)
+
+            # The comment structure in the JSON seems arbitrarily deep;
+            # this recursion handles any depth.
+            ret += self._get_all_children(reply['replies'])
+
+        return ret
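+
+    # Illustration (invented data, not a real API response): a thread like
+    #   {'rpid': 1, 'parent': 0, 'replies': [{'rpid': 2, 'parent': 1, 'replies': None}]}
+    # flattens to [{'id': 1, 'parent': 'root', ...}, {'id': 2, 'parent': 1, ...}],
+    # so the original tree can be rebuilt from the 'parent' field alone.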
+
+    def _get_raw_danmaku(self, video_id, cid):
+        # This will be useful if I decide to scrape all pages instead of doing them individually
+        # cid_url = "https://www.bilibili.com/widget/getPageList?aid=%s" % (video_id)
+        # cid_str = self._download_webpage(cid_url, video_id, note=False)
+        # cid = json.loads(cid_str)[0]['cid']
+
+        danmaku_url = "https://comment.bilibili.com/%s.xml" % (cid)
+        danmaku = self._download_webpage(danmaku_url, video_id, note='Downloading danmaku comments')
+        return danmaku
+
+    def _get_tags(self, video_id):
+        tags_url = "https://api.bilibili.com/x/tag/archive/tags?aid=%s" % (video_id)
+        tags_json = self._download_json(tags_url, video_id, note='Downloading tags')
+        return tags_json['data']
+

 class BiliBiliBangumiIE(InfoExtractor):
     _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'
@@ -324,6 +446,73 @@ def _real_extract(self, url):
             season_info.get('bangumi_title'), season_info.get('evaluate'))


+class BilibiliChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)'
+    # May need to add support for pagination? Need to find a user with many video uploads to test
+    _API_URL = "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=1&ps=25&jsonp=jsonp"
+    _TEST = {}  # TODO: Add tests
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+        json_str = self._download_webpage(self._API_URL % list_id, "None")
+
+        json_parsed = json.loads(json_str)
+        entries = [{
+            '_type': 'url',
+            'ie_key': BiliBiliIE.ie_key(),
+            'url': ('https://www.bilibili.com/video/%s' %
+                    entry['bvid']),
+            'id': entry['bvid'],
+        } for entry in json_parsed['data']['list']['vlist']]
+
+        return {
+            '_type': 'playlist',
+            'id': list_id,
+            'entries': entries
+        }
+
+
+class BiliBiliSearchIE(SearchInfoExtractor):
+    IE_DESC = 'Bilibili video search, "bilisearch" keyword'
+    _MAX_RESULTS = 100000
+    _SEARCH_KEY = 'bilisearch'
+    MAX_NUMBER_OF_RESULTS = 1000
+
+    def _get_n_results(self, query, n):
+        """Get a specified number of results for a query"""
+
+        entries = []
+        pageNumber = 0
+        while True:
+            pageNumber += 1
+            # FIXME
+            api_url = "https://api.bilibili.com/x/web-interface/search/type?context=&page=%s&order=pubdate&keyword=%s&duration=0&tids_2=&__refresh__=true&search_type=video&tids=0&highlight=1" % (pageNumber, query)
+            json_str = self._download_webpage(
+                api_url, "None", query={"Search_key": query},
+                note='Extracting results from page %s' % pageNumber)
+            data = json.loads(json_str)['data']
+
+            # FIXME: this is hideous
+            if "result" not in data:
+                return {
+                    '_type': 'playlist',
+                    'id': query,
+                    'entries': entries[:n]
+                }
+
+            videos = data['result']
+            for video in videos:
+                e = self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))
+                entries.append(e)
+
+            if len(entries) >= n or len(videos) >= BiliBiliSearchIE.MAX_NUMBER_OF_RESULTS:
+                return {
+                    '_type': 'playlist',
+                    'id': query,
+                    'entries': entries[:n]
+                }
+
+
 class BilibiliAudioBaseIE(InfoExtractor):
     def _call_api(self, path, sid, query=None):
         if not query:
diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py
index 10fd4a0b5..753778cc2 100644
--- a/youtube_dlc/extractor/extractors.py
+++ b/youtube_dlc/extractor/extractors.py
@@ -122,10 +122,12 @@ from .bild import BildIE
 from .bilibili import (
     BiliBiliIE,
+    BiliBiliSearchIE,
     BiliBiliBangumiIE,
     BilibiliAudioIE,
     BilibiliAudioAlbumIE,
     BiliBiliPlayerIE,
+    BilibiliChannelIE,
 )
 from .biobiochiletv import BioBioChileTVIE
 from .bitchute import (
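
With the new extractors registered, BiliBiliSearchIE answers queries through SearchInfoExtractor's standard `<key><count>:<term>` convention, and BilibiliChannelIE turns a space.bilibili.com user page into a playlist of url entries. A rough sketch of driving both from the embedding API (the query string and channel id below are placeholders):

    from youtube_dlc import YoutubeDL

    with YoutubeDL({'quiet': True}) as ydl:
        # First five search hits, as a playlist of BiliBili url entries
        search = ydl.extract_info('bilisearch5:example query', download=False)
        # A user's uploads, resolved via BilibiliChannelIE
        channel = ydl.extract_info('https://space.bilibili.com/123456', download=False)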
diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 7f3485db7..265c29a5a 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -2424,9 +2424,10 @@ def _extract_count(count_name):
                 default=None
                 ))

-        # annotations
-        video_annotations = None
-        if self._downloader.params.get('writeannotations', False):
+        # get xsrf for annotations or comments
+        get_annotations = self._downloader.params.get('writeannotations', False)
+        get_comments = self._downloader.params.get('getcomments', False)
+        if get_annotations or get_comments:
             xsrf_token = None
             ytcfg = self._extract_ytcfg(video_id, video_webpage)
             if ytcfg:
@@ -2435,6 +2436,10 @@ def _extract_count(count_name):
                 xsrf_token = self._search_regex(
                     r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
                     video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
+
+        # annotations
+        video_annotations = None
+        if get_annotations:
             invideo_url = try_get(
                 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
             if xsrf_token and invideo_url:
@@ -2454,6 +2459,153 @@ def _extract_count(count_name):

         chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)

+        # Get comments
+        # TODO: Refactor and move to separate function
+        if get_comments:
+            expected_video_comment_count = 0
+            video_comments = []
+
+            def find_value(html, key, num_chars=2, separator='"'):
+                pos_begin = html.find(key) + len(key) + num_chars
+                pos_end = html.find(separator, pos_begin)
+                return html[pos_begin: pos_end]
+
+            def search_dict(partial, key):
+                if isinstance(partial, dict):
+                    for k, v in partial.items():
+                        if k == key:
+                            yield v
+                        else:
+                            for o in search_dict(v, key):
+                                yield o
+                elif isinstance(partial, list):
+                    for i in partial:
+                        for o in search_dict(i, key):
+                            yield o
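+
+            # e.g. next(search_dict({'a': [{'target': 'x'}]}, 'target')) == 'x' --
+            # search_dict walks arbitrarily nested dicts/lists and yields every
+            # value stored under the given key, however deep it sits.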
+
+            try:
+                ncd = next(search_dict(yt_initial_data, 'nextContinuationData'))
+                continuations = [(ncd['continuation'], ncd['clickTrackingParams'])]
+            # Handle videos where comments have been disabled entirely
+            except StopIteration:
+                continuations = []
+
+            def get_continuation(continuation, itct, session_token, replies=False):
+                query = {
+                    'pbj': 1,
+                    'ctoken': continuation,
+                    'continuation': continuation,
+                    'itct': itct,
+                }
+                if replies:
+                    query['action_get_comment_replies'] = 1
+                else:
+                    query['action_get_comments'] = 1
+
+                while True:
+                    content, handle = self._download_webpage_handle(
+                        'https://www.youtube.com/comment_service_ajax',
+                        video_id,
+                        note=False,
+                        expected_status=[413],
+                        data=urlencode_postdata({
+                            'session_token': session_token
+                        }),
+                        query=query,
+                        headers={
+                            'Accept': '*/*',
+                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
+                            'X-YouTube-Client-Name': '1',
+                            'X-YouTube-Client-Version': '2.20201202.06.01'
+                        }
+                    )
+
+                    response_code = handle.getcode()
+                    if (response_code == 200):
+                        return self._parse_json(content, video_id)
+                    if (response_code == 413):
+                        # Sometimes Google produces continuations that are too
+                        # large for its own endpoint to accept; skip them.
+                        # self.to_screen(json.dumps(query))
+                        # self.to_screen('Google API rate limit detected; waiting 30 seconds before continuing')
+                        # time.sleep(30)
+                        # continue
+                        return None
+                    raise ExtractorError('Unexpected HTTP error code: %s' % response_code)
+
+            first_continuation = True
+            while continuations:
+                continuation, itct = continuations.pop()
+                comment_response = get_continuation(continuation, itct, xsrf_token)
+                if not comment_response:
+                    continue
+                if list(search_dict(comment_response, 'externalErrorMessage')):
+                    raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage')))
+
+                item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
+                if first_continuation:
+                    expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', ''))
+                    first_continuation = False
+                if 'contents' not in item_section:
+                    # continuation returned no comments?
+                    # set an empty array as to not break the for loop
+                    item_section['contents'] = []
+
+                for meta_comment in item_section['contents']:
+                    comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer']
+                    video_comments.append({
+                        'id': comment['commentId'],
+                        'text': ''.join([c['text'] for c in comment['contentText']['runs']]),
+                        'time_text': comment['publishedTimeText']['runs'][0]['text'],
+                        'author': comment.get('authorText', {}).get('simpleText', ''),
+                        'votes': comment.get('voteCount', {}).get('simpleText', '0'),
+                        'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'],
+                        'parent': 'root'
+                    })
+                    if 'replies' not in meta_comment['commentThreadRenderer']:
+                        continue
+
+                    reply_continuation = meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations'][0]['nextContinuationData']
+                    continuation = reply_continuation['continuation']
+                    itct = reply_continuation['clickTrackingParams']
+                    while True:
+                        time.sleep(1)
+                        replies_data = get_continuation(continuation, itct, xsrf_token, True)
+                        if not replies_data or 'continuationContents' not in replies_data[1]['response']:
+                            break
+
+                        if self._downloader.params.get('verbose', False):
+                            self.to_screen('[debug] Comments downloaded (chain %s) %s of ~%s' % (comment['commentId'], len(video_comments), expected_video_comment_count))
+                        reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation']
+                        for reply_meta in replies_data[1]['response']['continuationContents']['commentRepliesContinuation']['contents']:
+                            reply_comment = reply_meta['commentRenderer']
+                            video_comments.append({
+                                'id': reply_comment['commentId'],
+                                'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]),
+                                'time_text': reply_comment['publishedTimeText']['runs'][0]['text'],
+                                'author': reply_comment.get('authorText', {}).get('simpleText', ''),
+                                'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'),
+                                'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'],
+                                'parent': comment['commentId']
+                            })
+                        if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0:
+                            break
+
+                        continuation = reply_comment_meta['continuations'][0]['nextContinuationData']['continuation']
+                        itct = reply_comment_meta['continuations'][0]['nextContinuationData']['clickTrackingParams']
+
+                self.to_screen('Comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
+
+                if 'continuations' in item_section:
+                    new_continuations = [
+                        (ncd['nextContinuationData']['continuation'], ncd['nextContinuationData']['clickTrackingParams'])
+                        for ncd in item_section['continuations']]
+                    continuations += new_continuations
+                time.sleep(1)
+
+            self.to_screen('Total comments downloaded %s of ~%s' % (len(video_comments), expected_video_comment_count))
+        else:
+            expected_video_comment_count = None
+            video_comments = None
+
         # Look for the DASH manifest
         if self._downloader.params.get('youtube_include_dash_manifest', True):
             dash_mpd_fatal = True
@@ -2572,6 +2724,8 @@ def decrypt_sig(mobj):
             'release_year': release_year,
             'subscriber_count': subscriber_count,
             'playable_in_embed': playable_in_embed,
+            'comments': video_comments,
+            'comment_count': expected_video_comment_count,
         }
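
Each harvested comment, whether top-level or reply, is appended to `info_dict['comments']` in a flat, parent-linked shape, roughly as follows (field values are illustrative):

    {
        'id': 'Ugx...',                   # commentId from the renderer
        'text': 'example comment text',
        'time_text': '2 years ago',
        'author': 'example author',
        'votes': '42',
        'author_thumbnail': 'https://yt3.ggpht.com/...',
        'parent': 'root',                 # or the parent comment's id for replies
    }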
diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py
index 89c5cf6be..749a6c6e0 100644
--- a/youtube_dlc/options.py
+++ b/youtube_dlc/options.py
@@ -934,6 +934,10 @@ def _dict_from_multiple_values_options_callback(
         '--no-write-annotations',
         action='store_false', dest='writeannotations',
         help='Do not write video annotations (default)')
+    filesystem.add_option(
+        '--get-comments',
+        action='store_true', dest='getcomments', default=False,
+        help='Retrieve video comments to be placed in the .info.json file')
     filesystem.add_option(
         '--load-info-json', '--load-info',
         dest='load_info_filename', metavar='FILE',
@@ -1014,7 +1018,9 @@ def _dict_from_multiple_values_options_callback(
         metavar='FORMAT', dest='remuxvideo', default=None,
         help=(
             'Remux the video into another container if necessary (currently supported: mp4|mkv). '
-            'If target container does not support the video/audio codec, remuxing will fail'))
+            'If target container does not support the video/audio codec, remuxing will fail. '
+            'You can specify multiple rules; e.g. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 '
+            'and anything else to mkv.'))
     postproc.add_option(
         '--recode-video',
         metavar='FORMAT', dest='recodevideo', default=None,
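
As wired up in `__init__.py` earlier in this patch, `--get-comments` implies `writeinfojson`. From the embedding API the equivalent is roughly the following (the URL is youtube-dl's long-standing test video):

    from youtube_dlc import YoutubeDL

    opts = {
        'getcomments': True,    # populate info_dict['comments']
        'writeinfojson': True,  # implied by --get-comments on the CLI
        'skip_download': True,
    }
    with YoutubeDL(opts) as ydl:
        info = ydl.extract_info('https://www.youtube.com/watch?v=BaW_jenozKc', download=False)
        print(info.get('comment_count'))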
"aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 ' + 'and anything else to mkv.')) postproc.add_option( '--recode-video', metavar='FORMAT', dest='recodevideo', default=None, diff --git a/youtube_dlc/postprocessor/embedthumbnail.py b/youtube_dlc/postprocessor/embedthumbnail.py index 98a3531f1..bad005cca 100644 --- a/youtube_dlc/postprocessor/embedthumbnail.py +++ b/youtube_dlc/postprocessor/embedthumbnail.py @@ -4,6 +4,15 @@ import os import subprocess +import struct +import re +import base64 + +try: + import mutagen + _has_mutagen = True +except ImportError: + _has_mutagen = False from .ffmpeg import FFmpegPostProcessor @@ -11,11 +20,12 @@ check_executable, encodeArgument, encodeFilename, + error_to_compat_str, PostProcessingError, prepend_extension, + process_communicate_or_kill, replace_extension, shell_quote, - process_communicate_or_kill, ) @@ -73,6 +83,7 @@ def is_webp(path): # Rename back to unescaped for further processing os.rename(encodeFilename(escaped_thumbnail_jpg_filename), encodeFilename(thumbnail_jpg_filename)) thumbnail_filename = thumbnail_jpg_filename + thumbnail_ext = 'jpg' success = True if info['ext'] == 'mp3': @@ -83,47 +94,92 @@ def is_webp(path): self.to_screen('Adding thumbnail to "%s"' % filename) self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options) - elif info['ext'] == 'mkv': - options = [ - '-c', 'copy', '-map', '0', '-dn', '-attach', thumbnail_filename, - '-metadata:s:t', 'mimetype=image/jpeg', '-metadata:s:t', 'filename=cover.jpg'] + elif info['ext'] in ['mkv', 'mka']: + options = ['-c', 'copy', '-map', '0', '-dn'] + + mimetype = 'image/%s' % ('png' if thumbnail_ext == 'png' else 'jpeg') + old_stream, new_stream = self.get_stream_number( + filename, ('tags', 'mimetype'), mimetype) + if old_stream is not None: + options.extend(['-map', '-0:%d' % old_stream]) + new_stream -= 1 + options.extend([ + '-attach', thumbnail_filename, + '-metadata:s:%d' % new_stream, 'mimetype=%s' % mimetype, + '-metadata:s:%d' % new_stream, 'filename=cover.%s' % thumbnail_ext]) self.to_screen('Adding thumbnail to "%s"' % filename) - self.run_ffmpeg_multiple_files([filename], temp_filename, options) + self.run_ffmpeg(filename, temp_filename, options) - elif info['ext'] in ['m4a', 'mp4']: - if not check_executable('AtomicParsley', ['-v']): - raise EmbedThumbnailPPError('AtomicParsley was not found. 

-        elif info['ext'] in ['m4a', 'mp4']:
-            if not check_executable('AtomicParsley', ['-v']):
-                raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
+        elif info['ext'] in ['m4a', 'mp4', 'mov']:
+            try:
+                options = ['-c', 'copy', '-map', '0', '-dn', '-map', '1']

-            cmd = [encodeFilename('AtomicParsley', True),
-                   encodeFilename(filename, True),
-                   encodeArgument('--artwork'),
-                   encodeFilename(thumbnail_filename, True),
-                   encodeArgument('-o'),
-                   encodeFilename(temp_filename, True)]
-            cmd += [encodeArgument(o) for o in self._configuration_args(exe='AtomicParsley')]
+                old_stream, new_stream = self.get_stream_number(
+                    filename, ('disposition', 'attached_pic'), 1)
+                if old_stream is not None:
+                    options.extend(['-map', '-0:%d' % old_stream])
+                    new_stream -= 1
+                options.extend(['-disposition:%s' % new_stream, 'attached_pic'])

-            self.to_screen('Adding thumbnail to "%s"' % filename)
-            self.write_debug('AtomicParsley command line: %s' % shell_quote(cmd))
+                self.to_screen('Adding thumbnail to "%s"' % filename)
+                self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options)

-            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            stdout, stderr = process_communicate_or_kill(p)
+            except PostProcessingError as err:
+                self.report_warning('unable to embed using ffprobe & ffmpeg; %s' % error_to_compat_str(err))
+                if not check_executable('AtomicParsley', ['-v']):
+                    raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')

-            if p.returncode != 0:
-                msg = stderr.decode('utf-8', 'replace').strip()
-                raise EmbedThumbnailPPError(msg)
-            # for formats that don't support thumbnails (like 3gp) AtomicParsley
-            # won't create to the temporary file
-            if b'No changes' in stdout:
-                self.report_warning('The file format doesn\'t support embedding a thumbnail')
-                success = False
+                cmd = [encodeFilename('AtomicParsley', True),
+                       encodeFilename(filename, True),
+                       encodeArgument('--artwork'),
+                       encodeFilename(thumbnail_filename, True),
+                       encodeArgument('-o'),
+                       encodeFilename(temp_filename, True)]
+                cmd += [encodeArgument(o) for o in self._configuration_args(exe='AtomicParsley')]
+
+                self.to_screen('Adding thumbnail to "%s"' % filename)
+                self.write_debug('AtomicParsley command line: %s' % shell_quote(cmd))
+                p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                stdout, stderr = process_communicate_or_kill(p)
+                if p.returncode != 0:
+                    msg = stderr.decode('utf-8', 'replace').strip()
+                    raise EmbedThumbnailPPError(msg)
+                # for formats that don't support thumbnails (like 3gp) AtomicParsley
+                # won't create the temporary file
+                if b'No changes' in stdout:
+                    self.report_warning('The file format doesn\'t support embedding a thumbnail')
+                    success = False
+
+        elif info['ext'] in ['ogg', 'opus']:
+            if not _has_mutagen:
+                raise EmbedThumbnailPPError('module mutagen was not found. Please install.')
+            size_regex = r',\s*(?P<w>\d+)x(?P<h>\d+)\s*[,\[]'
+            size_result = self.run_ffmpeg_multiple_files([thumbnail_filename], '', ['-hide_banner'])
+            mobj = re.search(size_regex, size_result)
+            width, height = int(mobj.group('w')), int(mobj.group('h'))
+            mimetype = ('image/%s' % ('png' if thumbnail_ext == 'png' else 'jpeg')).encode('ascii')
+
+            # https://xiph.org/flac/format.html#metadata_block_picture
+            data = bytearray()
+            data += struct.pack('>II', 3, len(mimetype))
+            data += mimetype
+            data += struct.pack('>IIIIII', 0, width, height, 8, 0, os.stat(thumbnail_filename).st_size)  # 32 if png else 24
+
+            fin = open(thumbnail_filename, "rb")
+            data += fin.read()
+            fin.close()
+
+            temp_filename = filename
+            f = mutagen.File(temp_filename)
+            f.tags['METADATA_BLOCK_PICTURE'] = base64.b64encode(data).decode('ascii')
+            f.save()

         else:
-            raise EmbedThumbnailPPError('Only mp3, mkv, m4a and mp4 are supported for thumbnail embedding for now.')
+            raise EmbedThumbnailPPError('Supported filetypes for thumbnail embedding are: mp3, mkv/mka, ogg/opus, m4a/mp4/mov')

-        if success:
+        if success and temp_filename != filename:
             os.remove(encodeFilename(filename))
             os.rename(encodeFilename(temp_filename), encodeFilename(filename))
-
         files_to_delete = [] if self._already_have_thumbnail else [thumbnail_filename]
         return files_to_delete, info
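
The blob written above follows the FLAC METADATA_BLOCK_PICTURE layout (picture type, MIME length, MIME string, then description/geometry fields and the raw image). A small round-trip sketch for sanity-checking an embedded cover; it assumes mutagen is installed and `path` points at the tagged ogg/opus file:

    import base64
    import struct

    import mutagen

    def read_picture_header(path):
        tag = mutagen.File(path).tags['METADATA_BLOCK_PICTURE'][0]
        data = base64.b64decode(tag)
        picture_type, mime_len = struct.unpack('>II', data[:8])
        mimetype = data[8:8 + mime_len].decode('ascii')
        return picture_type, mimetype  # expect (3, 'image/jpeg'): 3 == front cover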
diff --git a/youtube_dlc/postprocessor/ffmpeg.py b/youtube_dlc/postprocessor/ffmpeg.py
index 18696a932..f2be0f415 100644
--- a/youtube_dlc/postprocessor/ffmpeg.py
+++ b/youtube_dlc/postprocessor/ffmpeg.py
@@ -5,6 +5,7 @@
 import subprocess
 import time
 import re
+import json

 from .common import AudioConversionError, PostProcessor

@@ -20,8 +21,9 @@
     subtitles_filename,
     dfxp2srt,
     ISO639Utils,
-    replace_extension,
     process_communicate_or_kill,
+    replace_extension,
+    traverse_dict,
 )


@@ -201,6 +203,37 @@ def get_audio_codec(self, path):
                 return mobj.group(1)
         return None

+    def get_metadata_object(self, path, opts=[]):
+        if self.probe_basename != 'ffprobe':
+            if self.probe_available:
+                self.report_warning('Only ffprobe is supported for metadata extraction')
+            raise PostProcessingError('ffprobe not found. Please install.')
+        self.check_version()
+
+        cmd = [
+            encodeFilename(self.probe_executable, True),
+            encodeArgument('-hide_banner'),
+            encodeArgument('-show_format'),
+            encodeArgument('-show_streams'),
+            encodeArgument('-print_format'),
+            encodeArgument('json'),
+        ]
+
+        cmd += opts
+        cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True))
+        if self._downloader.params.get('verbose', False):
+            self._downloader.to_screen('[debug] ffprobe command line: %s' % shell_quote(cmd))
+        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+        stdout, stderr = p.communicate()
+        return json.loads(stdout.decode('utf-8', 'replace'))
+
+    def get_stream_number(self, path, keys, value):
+        streams = self.get_metadata_object(path)['streams']
+        num = next(
+            (i for i, stream in enumerate(streams) if traverse_dict(stream, keys, casesense=False) == value),
+            None)
+        return num, len(streams)
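+
+    # e.g. with keys=('tags', 'mimetype') and value='image/jpeg', a probe result
+    # like {'streams': [{'codec_type': 'video'}, {'tags': {'MIMETYPE': 'image/jpeg'}}]}
+    # yields (1, 2): the existing attachment is stream 1 of 2 in total.
+    # traverse_dict (added to utils.py below) does the case-insensitive key walk.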
+
     def run_ffmpeg_multiple_files(self, input_paths, out_path, opts):
         self.check_version()
@@ -227,10 +260,12 @@ def run_ffmpeg_multiple_files(self, input_paths, out_path, opts):
         p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
         stdout, stderr = process_communicate_or_kill(p)
         if p.returncode != 0:
-            stderr = stderr.decode('utf-8', 'replace')
-            msg = stderr.strip().split('\n')[-1]
-            raise FFmpegPostProcessorError(msg)
+            stderr = stderr.decode('utf-8', 'replace').strip()
+            if self._downloader.params.get('verbose', False):
+                self.report_error(stderr)
+            raise FFmpegPostProcessorError(stderr.split('\n')[-1])
         self.try_utime(out_path, oldest_mtime, oldest_mtime)
+        return stderr.decode('utf-8', 'replace')

     def run_ffmpeg(self, path, out_path, opts):
         self.run_ffmpeg_multiple_files([path], out_path, opts)
@@ -240,6 +275,8 @@ def _ffmpeg_filename_argument(self, fn):
         # interprets that as a protocol) or can start with '-' (-- is broken in
         # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details)
         # Also leave '-' intact in order not to break streaming to stdout.
+        if fn.startswith(('http://', 'https://')):
+            return fn
         return 'file:' + fn if fn != '-' else fn


@@ -349,21 +386,35 @@ def run(self, information):
 class FFmpegVideoRemuxerPP(FFmpegPostProcessor):
     def __init__(self, downloader=None, preferedformat=None):
         super(FFmpegVideoRemuxerPP, self).__init__(downloader)
-        self._preferedformat = preferedformat
+        self._preferedformats = preferedformat.lower().split('/')

     def run(self, information):
         path = information['filepath']
-        if information['ext'] == self._preferedformat:
-            self.to_screen('Not remuxing video file %s - already is in target format %s' % (path, self._preferedformat))
+        sourceext, targetext = information['ext'].lower(), None
+        for pair in self._preferedformats:
+            kv = pair.split('>')
+            if len(kv) == 1 or kv[0].strip() == sourceext:
+                targetext = kv[-1].strip()
+                break
+
+        _skip_msg = (
+            'could not find a mapping for %s' if not targetext
+            else 'already is in target format %s' if sourceext == targetext
+            else None)
+        if _skip_msg:
+            self.to_screen('Not remuxing media file %s - %s' % (path, _skip_msg % sourceext))
             return [], information
+
         options = ['-c', 'copy', '-map', '0', '-dn']
-        prefix, sep, ext = path.rpartition('.')
-        outpath = prefix + sep + self._preferedformat
-        self.to_screen('Remuxing video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath)
+        if targetext in ['mp4', 'm4a', 'mov']:
+            options.extend(['-movflags', '+faststart'])
+        prefix, sep, oldext = path.rpartition('.')
+        outpath = prefix + sep + targetext
+        self.to_screen('Remuxing video from %s to %s; Destination: %s' % (sourceext, targetext, outpath))
         self.run_ffmpeg(path, outpath, options)
         information['filepath'] = outpath
-        information['format'] = self._preferedformat
-        information['ext'] = self._preferedformat
+        information['format'] = targetext
+        information['ext'] = targetext
         return [path], information


@@ -406,18 +457,22 @@ def run(self, information):
         sub_langs = []
         sub_filenames = []
         webm_vtt_warn = False
+        mp4_ass_warn = False
         for lang, sub_info in subtitles.items():
             sub_ext = sub_info['ext']
             if sub_ext == 'json':
-                self.to_screen('JSON subtitles cannot be embedded')
+                self.report_warning('JSON subtitles cannot be embedded')
             elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt':
                 sub_langs.append(lang)
                 sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext))
             else:
                 if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt':
                     webm_vtt_warn = True
-                    self.to_screen('Only WebVTT subtitles can be embedded in webm files')
+                    self.report_warning('Only WebVTT subtitles can be embedded in webm files')
+                if not mp4_ass_warn and ext == 'mp4' and sub_ext == 'ass':
+                    mp4_ass_warn = True
+                    self.report_warning('ASS subtitles cannot be properly embedded in mp4 files; expect issues')

         if not sub_langs:
             return [], information
@@ -441,7 +496,7 @@ def run(self, information):
             opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])

         temp_filename = prepend_extension(filename, 'temp')
-        self.to_screen('Embedding subtitles in \'%s\'' % filename)
+        self.to_screen('Embedding subtitles in "%s"' % filename)
         self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
         os.remove(encodeFilename(filename))
         os.rename(encodeFilename(temp_filename), encodeFilename(filename))
@@ -471,7 +526,6 @@ def add(meta_list, info_list=None):
         # 1. https://kdenlive.org/en/project/adding-meta-data-to-mp4-video/
         # 2. https://wiki.multimedia.cx/index.php/FFmpeg_Metadata
         # 3. https://kodi.wiki/view/Video_file_tagging
-        # 4. http://atomicparsley.sourceforge.net/mpeg-4files.html

         add('title', ('track', 'title'))
         add('date', 'upload_date')
@@ -524,6 +578,18 @@ def ffmpeg_escape(text):
             in_filenames.append(metadata_filename)
             options.extend(['-map_metadata', '1'])

+        if '__infojson_filepath' in info and info['ext'] in ('mkv', 'mka'):
+            old_stream, new_stream = self.get_stream_number(
+                filename, ('tags', 'mimetype'), 'application/json')
+            if old_stream is not None:
+                options.extend(['-map', '-0:%d' % old_stream])
+                new_stream -= 1
+
+            options.extend([
+                '-attach', info['__infojson_filepath'],
+                '-metadata:s:%d' % new_stream, 'mimetype=application/json'
+            ])
+
         self.to_screen('Adding metadata to \'%s\'' % filename)
         self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options)
         if chapters:
diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py
index 34a14424a..4aaee0b5f 100644
--- a/youtube_dlc/utils.py
+++ b/youtube_dlc/utils.py
@@ -5934,3 +5934,14 @@ def load_plugins(name, type, namespace):
         if plugin_info[0] is not None:
             plugin_info[0].close()
     return classes
+
+
+def traverse_dict(dictn, keys, casesense=True):
+    if not isinstance(dictn, dict):
+        return None
+    first_key = keys[0]
+    if not casesense:
+        dictn = {key.lower(): val for key, val in dictn.items()}
+        first_key = first_key.lower()
+    value = dictn.get(first_key, None)
+    return value if len(keys) < 2 else traverse_dict(value, keys[1:], casesense)
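
traverse_dict is the small helper behind get_stream_number's case-insensitive probe lookups; its behaviour in doctest form (values are illustrative):

    >>> stream = {'Tags': {'MIMETYPE': 'image/jpeg'}}
    >>> traverse_dict(stream, ('tags', 'mimetype'), casesense=False)
    'image/jpeg'
    >>> traverse_dict(stream, ('tags', 'filename'), casesense=False) is None
    True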