This commit is contained in:
Raymond Hill 2018-01-05 13:15:56 -05:00
parent fcd2124ad3
commit a0375bb6a3
No known key found for this signature in database
GPG key ID: 25E1490B761470C2
3 changed files with 102 additions and 23 deletions

View file

@ -517,6 +517,7 @@ var onMessage = function(request, sender, callback) {
// already been injected.
if (
µb.canFilterResponseBody === false ||
µb.textEncode === undefined ||
µb.textEncode.normalizeCharset(request.charset) === undefined
) {
response.scriptlets = µb.scriptletFilteringEngine.retrieve(request);

View file

@ -25,16 +25,40 @@
µBlock.textEncode = (function() {
if ( µBlock.canFilterResponseBody !== true ) { return; }
// Maps a charset label to its normalized charset name.
// Labels (aliases) extracted from:
// https://github.com/inexorabletash/text-encoding/blob/b4e5bc26e26e51f56e3daa9f13138c79f49d3c34/lib/encoding.js#L342
var normalizedCharset = (function() {
    // One row per normalized name: the name first, then every label which
    // resolves to it.
    var labelGroups = [
        [ 'utf-8',
            'utf8', 'unicode-1-1-utf-8', 'utf-8' ],
        [ 'windows-1250',
            'windows-1250', 'cp1250', 'x-cp1250' ],
        [ 'windows-1251',
            'windows-1251', 'cp1251', 'x-cp1251' ],
        [ 'windows-1252',
            'windows-1252', 'ansi_x3.4-1968', 'ascii', 'cp1252', 'cp819',
            'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-ir-100', 'iso8859-1',
            'iso88591', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1',
            'us-ascii', 'x-cp1252' ]
    ];
    var lookup = new Map();
    labelGroups.forEach(function(group) {
        var normalized = group[0];
        for ( var k = 1; k < group.length; k++ ) {
            lookup.set(group[k], normalized);
        }
    });
    return lookup;
})();
// http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
@ -77,7 +101,17 @@
/* 0x0478 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
/* 0x0480 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
/* 0x0488 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
/* 0x0490 */ 0xA5, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
/* 0x0490 */ 0xA5, 0xB4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
]);
// windows-1252 bytes for code points 0x0150 through 0x017F, indexed by
// (code point - 0x0150). A 0x00 entry means the table holds no byte for that
// code point.
// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
var cp1252_range0 = (function() {
    var firstCodePoint = 0x0150;
    var table = new Uint8Array(0x0180 - firstCodePoint);
    // [ code point, windows-1252 byte ]
    [
        [ 0x0152, 0x8C ],
        [ 0x0153, 0x9C ],
        [ 0x0160, 0x8A ],
        [ 0x0161, 0x9A ],
        [ 0x0178, 0x9F ],
        [ 0x017D, 0x8E ],
        [ 0x017E, 0x9E ]
    ].forEach(function(pair) {
        table[pair[0] - firstCodePoint] = pair[1];
    });
    return table;
})();
var cp125x_range0 = new Uint8Array([
@ -171,6 +205,47 @@
}
}
return buf.slice(0, o);
},
'windows-1252': function(buf) {
var i = 0, n = buf.byteLength, o = 0, c;
while ( i < n ) {
c = buf[i++];
if ( c < 0x80 ) {
buf[o++] = c;
} else {
if ( (c & 0xE0) === 0xC0 ) {
c = (c & 0x1F) << 6;
c |= (buf[i++] & 0x3F);
} else if ( (c & 0xF0) === 0xE0 ) {
c = (c & 0x0F) << 12;
c |= (buf[i++] & 0x3F) << 6;
c |= (buf[i++] & 0x3F);
} else if ( (c & 0xF8) === 0xF0 ) {
c = (c & 0x07) << 18;
c |= (buf[i++] & 0x3F) << 12;
c |= (buf[i++] & 0x3F) << 6;
c |= (buf[i++] & 0x3F);
}
if ( c < 0x100 ) {
buf[o++] = c;
} else if ( c >= 0x150 && c < 0x180 ) {
buf[o++] = cp1252_range0[c - 0x150];
} else if ( c >= 0x2010 && c < 0x2040 ) {
buf[o++] = cp125x_range0[c - 0x2010];
} else if ( c === 0x192 ) {
buf[o++] = 0x83;
} else if ( c === 0x2C6 ) {
buf[o++] = 0x88;
} else if ( c === 0x2DC ) {
buf[o++] = 0x98;
} else if ( c === 0x20AC ) {
buf[o++] = 0x80;
} else if ( c === 0x2122 ) {
buf[o++] = 0x99;
}
}
}
return buf.slice(0, o);
}
};

View file

@ -577,7 +577,7 @@ var filterDocument = (function() {
var µb = µBlock,
filterers = new Map(),
domParser, xmlSerializer,
textDecoderCharset, textDecoder, textEncoder;
utf8TextDecoder, textDecoder, textEncoder;
// Matches a content type value which starts with `text/html` or
// `application/xhtml+xml` (case-insensitive). The `+` must be escaped:
// unescaped it is a quantifier applied to the preceding `l`, and the pattern
// then never matches the literal `application/xhtml+xml`.
var reContentTypeDocument = /^(?:text\/html|application\/xhtml\+xml)/i,
    // Captures the value of a `charset=` parameter, optionally quoted. The
    // value ends at a quote, a space, or a `;` introducing another parameter.
    reContentTypeCharset = /charset=['"]?([^'" ;]+)/i;
@ -737,29 +737,17 @@ var filterDocument = (function() {
textEncoder = new TextEncoder();
}
// In case of unknown charset, assume utf-8.
if (
filterer.charset === undefined && textDecoderCharset !== 'utf-8' ||
filterer.charset !== undefined && filterer.charset !== textDecoderCharset
) {
textDecoder = undefined;
}
if ( textDecoder === undefined ) {
try {
textDecoder = new TextDecoder(filterer.charset);
textDecoderCharset = filterer.charset || 'utf-8';
} catch(ex) {
textDecoder = new TextDecoder();
textDecoderCharset = 'utf-8';
}
}
var doc = domParser.parseFromString(
textDecoder.decode(filterer.buffer),
'text/html'
);
var doc;
// If stream encoding is still unknown, try to extract it from the document.
if ( filterer.charset === undefined ) {
if ( utf8TextDecoder === undefined ) {
utf8TextDecoder = new TextDecoder();
}
doc = domParser.parseFromString(
utf8TextDecoder.decode(filterer.buffer.slice(0, 1024)),
'text/html'
);
filterer.charset = µb.textEncode.normalizeCharset(charsetFromDoc(doc));
if ( filterer.charset === undefined ) {
streamClose(filterer);
@ -767,6 +755,21 @@ var filterDocument = (function() {
}
}
if (
textDecoder !== undefined &&
textDecoder.encoding !== filterer.charset
) {
textDecoder = undefined;
}
if ( textDecoder === undefined ) {
textDecoder = new TextDecoder(filterer.charset);
}
doc = domParser.parseFromString(
textDecoder.decode(filterer.buffer),
'text/html'
);
var modified = false;
if ( filterer.selectors !== undefined ) {
if ( µb.htmlFilteringEngine.apply(doc, filterer) ) {