From 80e8493ee7c3083f4e215794e4a67ba5265f24f7 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 18 May 2022 06:42:43 +0530 Subject: [PATCH] [utils] `is_html`: Handle double BOM Closes #2885 --- yt_dlp/extractor/generic.py | 15 +++++++++++++++ yt_dlp/utils.py | 13 ++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index dda2b1eef..b0fc176ef 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2527,6 +2527,21 @@ class GenericIE(InfoExtractor): 'upload_date': '20220504', }, }, + { + # Webpage contains double BOM + 'url': 'https://www.filmarkivet.se/movies/paris-d-moll/', + 'md5': 'df02cadc719dcc63d43288366f037754', + 'info_dict': { + 'id': 'paris-d-moll', + 'ext': 'mp4', + 'upload_date': '20220518', + 'title': 'Paris d-moll', + 'description': 'md5:319e37ea5542293db37e1e13072fe330', + 'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg', + 'timestamp': 1652833414, + 'age_limit': 0, + } + } ] def report_following_redirect(self, new_url): diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 48a94415d..3b0e6750c 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3290,14 +3290,13 @@ def is_html(first_bytes): (b'\xff\xfe', 'utf-16-le'), (b'\xfe\xff', 'utf-16-be'), ] - for bom, enc in BOMS: - if first_bytes.startswith(bom): - s = first_bytes[len(bom):].decode(enc, 'replace') - break - else: - s = first_bytes.decode('utf-8', 'replace') - return re.match(r'^\s*<', s) + encoding = 'utf-8' + for bom, enc in BOMS: + while first_bytes.startswith(bom): + encoding, first_bytes = enc, first_bytes[len(bom):] + + return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace')) def determine_protocol(info_dict):