From 86cb922118b236306310a72657f70426c20e28bb Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 9 Mar 2023 23:13:02 +0530 Subject: [PATCH] [extractor/youtube] Add extractor-arg `include_duplicate_formats` --- README.md | 1 + yt_dlp/extractor/youtube.py | 38 +++++++++++++++++++++---------------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index f28bf8e12..de83e421f 100644 --- a/README.md +++ b/README.md @@ -1787,6 +1787,7 @@ #### youtube * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total +* `include_duplicate_formats`: Extract formats with identical content but different URLs or protocol. This is useful if some of the formats are unavailable or throttled. * `include_incomplete_formats`: Extract formats that cannot be downloaded completely (live dash and post-live m3u8) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 79174b882..48f822e44 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3640,6 +3640,7 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres' ]) streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...)) + all_formats = self._configuration_arg('include_duplicate_formats') for fmt in streaming_formats: if fmt.get('targetDurationSec'): @@ -3648,8 +3649,9 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l itag = str_or_none(fmt.get('itag')) audio_track = fmt.get('audioTrack') or {} stream_id = (itag, audio_track.get('id'), fmt.get('isDrc')) - if stream_id in stream_ids: - continue + if not all_formats: + if stream_id in stream_ids: + continue quality = fmt.get('quality') height = int_or_none(fmt.get('height')) @@ -3739,7 +3741,7 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), throttled and 'THROTTLED', is_damaged and 'DAMAGED', - self.get_param('verbose') and client_name, + (self.get_param('verbose') or all_formats) and client_name, delim=', '), # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 'source_preference': -10 if throttled else -5 if itag == '22' else -1, @@ -3762,26 +3764,28 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l if mime_mobj: dct['ext'] = mimetype2ext(mime_mobj.group(1)) dct.update(parse_codecs(mime_mobj.group(2))) - + if itag: + itags[itag].add(('https', dct.get('language'))) + stream_ids.append(stream_id) single_stream = 'none' in (dct.get('acodec'), dct.get('vcodec')) if single_stream and dct.get('ext'): dct['container'] = dct['ext'] + '_dash' - if single_stream or itag == '17': - CHUNK_SIZE = 10 << 20 - dct.update({ + + CHUNK_SIZE = 10 << 20 + if dct['filesize']: + yield { + **dct, + 'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'], 'protocol': 'http_dash_segments', 'fragments': [{ 'url': update_url_query(dct['url'], { 'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, dct["filesize"])}' }) } for range_start in range(0, dct['filesize'], CHUNK_SIZE)] - } if itag != '17' and dct['filesize'] else { - 'downloader_options': {'http_chunk_size': CHUNK_SIZE} - }) - - if itag: - itags[itag].add(('https', dct.get('language'))) - stream_ids.append(stream_id) + } + if not all_formats: + continue + dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE} yield dct needs_live_processing = self._needs_live_processing(live_status, duration) @@ -3803,11 +3807,13 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l def process_manifest_format(f, proto, client_name, itag): key = (proto, f.get('language')) - if key in itags[itag]: + if not all_formats and key in itags[itag]: return False itags[itag].add(key) - if any(p != proto for p, _ in itags[itag]): + if itag and all_formats: + f['format_id'] = f'{itag}-{proto}' + elif any(p != proto for p, _ in itags[itag]): f['format_id'] = f'{itag}-{proto}' elif itag: f['format_id'] = itag