uBlock/src/js/utils.js

608 lines
19 KiB
JavaScript
Raw Normal View History

2014-06-24 00:42:43 +02:00
/*******************************************************************************
uBlock Origin - a browser extension to block requests.
Copyright (C) 2014-present Raymond Hill
2014-06-24 00:42:43 +02:00
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see {http://www.gnu.org/licenses/}.
Home: https://github.com/gorhill/uBlock
*/
'use strict';
2014-06-24 00:42:43 +02:00
/******************************************************************************/
// A standalone URL tokenizer allows us to use URL tokens in more than just
// the static filtering engine. This opens the door to optimizing other parts
// of the filtering engine aside from static filtering. This also allows:
// - Tokenizing only on demand.
// - Potentially avoiding tokenizing when the same URL is fed to the
//   tokenizer again.
//   - Benchmarking shows this to be a common occurrence.
//
// https://github.com/gorhill/uBlock/issues/2630
// Slice the input URL into a list of safe-integer token values, instead of a
// list of substrings. The assumption is that dealing only with numeric
// values means fewer underlying memory allocations and, as a consequence,
// less work for the garbage collector down the road.
// Another assumption is that using a numeric key value for Map() is more
// efficient than using a string key value (but that is something I would
// have to benchmark).
// Benchmark for string-based tokens vs. safe-integer token values:
// https://gorhill.github.io/obj-vs-set-vs-map/tokenize-to-str-vs-to-int.html
µBlock.urlTokenizer = {
    // Set the URL to be tokenized and return its lower-cased form.
    // Feeding the same URL as in the previous call keeps whatever tokens
    // were already computed for it.
    setURL: function(url) {
        if ( url !== this._urlIn ) {
            this._urlIn = url;
            this._urlOut = url.toLowerCase();
            this._tokenized = false;
        }
        return this._urlOut;
    },

    // Tokenize on demand.
    // Returns the shared token array: a flat list of `hash, position` pairs
    // terminated by a single `0`. The very same array instance is reused for
    // every URL and is never truncated, so whatever is found past the
    // terminating `0` is stale and must be ignored.
    getTokens: function() {
        if ( this._tokenized === false ) {
            this._tokenize();
            this._tokenized = true;
        }
        return this._tokens;
    },

    // Compute the integer hash of a token string.
    // - The empty string hashes to 0.
    // - The single-character strings `*` and `.` hash to 63 and 62.
    // - Otherwise at most the first 8 characters are used, each one
    //   contributing a base-64 digit taken from `_validTokenChars`.
    // Characters absent from the lookup table contribute 0.
    tokenHashFromString: function(s) {
        const l = s.length;
        if ( l === 0 ) { return 0; }
        if ( l === 1 ) {
            if ( s === '*' ) { return 63; }
            if ( s === '.' ) { return 62; }
        }
        const vtc = this._validTokenChars;
        // `|| 0`: char codes >= 128 fall outside the table and would
        // otherwise yield `undefined`, turning the hash into NaN.
        let th = vtc[s.charCodeAt(0)] || 0;
        for ( let i = 1; i !== 8 && i !== l; i++ ) {
            th = th * 64 + (vtc[s.charCodeAt(i)] || 0);
        }
        return th;
    },

    // https://github.com/chrisaljoudi/uBlock/issues/1118
    // We limit the amount of work: only the first 2048 characters of the URL
    // are scanned for tokens.
    _tokenize: function() {
        const tokens = this._tokens;
        let url = this._urlOut;
        let l = url.length;
        if ( l === 0 ) { tokens[0] = 0; return; }
        if ( l > 2048 ) {
            url = url.slice(0, 2048);
            l = 2048;
        }
        const vtc = this._validTokenChars;
        let i = 0, j = 0, v, n, ti, th;
        for (;;) {
            // Skip separators, i.e. anything which is not a valid token
            // character. Char codes outside the table are separators too.
            for (;;) {
                if ( i === l ) { tokens[j] = 0; return; }
                v = vtc[url.charCodeAt(i++)] || 0;
                if ( v !== 0 ) { break; }
            }
            // Start of a token: remember its position, then hash at most
            // its first 8 characters while consuming all of them.
            th = v; ti = i - 1; n = 1;
            for (;;) {
                if ( i === l ) { break; }
                v = vtc[url.charCodeAt(i++)] || 0;
                if ( v === 0 ) { break; }
                if ( n === 8 ) { continue; }
                th = th * 64 + v;
                n += 1;
            }
            tokens[j++] = th;
            tokens[j++] = ti;
        }
    },

    _urlIn: '',
    _urlOut: '',
    _tokenized: false,
    _tokens: [ 0 ],

    // Lookup table: char code => 1-based index of the character in the list
    // of valid token characters; 0 for any other char code below 128.
    _validTokenChars: (function() {
        const vtc = new Uint8Array(128);
        const chars = '0123456789%abcdefghijklmnopqrstuvwxyz';
        let i = chars.length;
        while ( i-- ) {
            vtc[chars.charCodeAt(i)] = i + 1;
        }
        return vtc;
    })()
};
2014-06-24 00:42:43 +02:00
/******************************************************************************/
// Format a count as a short string.
// - Anything which is not a finite number yields the empty string.
// - Below 1000, the count is returned as is (rounded by toFixed(0)).
// - From 1000 up, the count is abbreviated with a `k` (thousands) or `M`
//   (millions) suffix; 4-digit counts are additionally prefixed with `>`.
µBlock.formatCount = function(count) {
    // NaN/Infinity are of type 'number' too, and would otherwise end up
    // as 'NaN' or 'InM'.
    if ( typeof count !== 'number' || Number.isFinite(count) === false ) {
        return '';
    }
    const s = count.toFixed(0);
    if ( count < 1000 ) { return s; }
    if ( count < 10000 ) { return '>' + s.slice(0, 1) + 'k'; }
    if ( count < 100000 ) { return s.slice(0, 2) + 'k'; }
    if ( count < 1000000 ) { return s.slice(0, 3) + 'k'; }
    if ( count < 10000000 ) { return s.slice(0, 1) + 'M'; }
    // Drop the last 6 digits, keep whatever number of millions is left.
    return s.slice(0, -6) + 'M';
};
2014-08-20 15:24:16 +02:00
// https://www.youtube.com/watch?v=DyvzfyqYm_s
/******************************************************************************/
2016-08-13 22:42:58 +02:00
2016-10-13 19:25:57 +02:00
// Return the current local date and time as `YYYY-MM-DD_HH.MM.SS`.
// The timezone offset is subtracted up front so that the UTC-based ISO
// string ends up showing the local wall-clock time.
µBlock.dateNowToSensibleString = function() {
    const nowMs = Date.now();
    const tzOffsetMs = (new Date()).getTimezoneOffset() * 60000;
    const isoLocal = new Date(nowMs - tzOffsetMs).toISOString();
    // Drop the milliseconds and the trailing `Z`.
    const withoutMs = isoLocal.replace(/\.\d+Z$/, '');
    const withDots = withoutMs.replace(/:/g, '.');
    return withDots.replace('T', '_');
};
/******************************************************************************/
2016-08-13 22:42:58 +02:00
// Iterate over the lines of a text, without splitting the text up front.
//   text:   the text to iterate over
//   offset: optional position at which to start, defaults to 0
µBlock.LineIterator = function(text, offset) {
    this.text = text;
    this.textLen = this.text.length;
    this.offset = offset || 0;
};

// Return the line starting at the current position and move past it.
//   offset: optional number of characters to skip before reading the line
// A line ends at the next `\n`; when there is no `\n` left, at the next
// `\r`; when there is neither, at the end of the text.
µBlock.LineIterator.prototype.next = function(offset) {
    if ( offset !== undefined ) {
        this.offset += offset;
    }
    let lineEnd = this.text.indexOf('\n', this.offset);
    if ( lineEnd === -1 ) {
        lineEnd = this.text.indexOf('\r', this.offset);
        if ( lineEnd === -1 ) {
            lineEnd = this.textLen;
        }
    }
    const line = this.text.slice(this.offset, lineEnd);
    // Step over the line terminator.
    this.offset = lineEnd + 1;
    return line;
};

// Return the char code found `offset` characters away from the current
// position, without moving the current position.
µBlock.LineIterator.prototype.charCodeAt = function(offset) {
    return this.text.charCodeAt(this.offset + offset);
};

// Whether the end of the text has been reached.
µBlock.LineIterator.prototype.eot = function() {
    return this.offset >= this.textLen;
};
/******************************************************************************/
// The field iterator is less CPU-intensive than when using native
// String.split().
// Iterate over the fields of a text, fields being delimited by `sep`.
µBlock.FieldIterator = function(sep) {
    this.text = '';
    this.sep = sep;
    this.sepLen = sep.length;
    this.offset = 0;
};

// Start iterating over a new text, and return its first field.
µBlock.FieldIterator.prototype.first = function(text) {
    this.offset = 0;
    this.text = text;
    return this.next();
};

// Return the field starting at the current position, and move past the
// separator which follows it. The last field extends to the end of the text.
µBlock.FieldIterator.prototype.next = function() {
    const start = this.offset;
    const sepPos = this.text.indexOf(this.sep, start);
    const stop = sepPos !== -1 ? sepPos : this.text.length;
    this.offset = stop + this.sepLen;
    return this.text.slice(start, stop);
};

// Return whatever is left of the text from the current position, without
// moving the current position.
µBlock.FieldIterator.prototype.remainder = function() {
    const from = this.offset;
    return this.text.slice(from);
};
/******************************************************************************/
// Line-oriented serializer/unserializer, organized in numbered blocks.
//
// Serialized form, as produced by Writer.toString():
//
//   #block-start-<id>
//   <one serialized entry per line>
//   #block-end-<id>
//
µBlock.CompiledLineIO = {
    serialize: JSON.stringify,
    unserialize: JSON.parse,
    blockStartPrefix: '#block-start-',  // ensure no special regex characters
    blockEndPrefix: '#block-end-',      // ensure no special regex characters

    // Collects serialized entries into blocks. A block must be selected
    // with select() before entries can be pushed.
    Writer: function() {
        this.io = µBlock.CompiledLineIO;
        this.blockId = undefined;
        this.block = undefined;
        this.stringifier = this.io.serialize;
        this.blocks = new Map();
        this.properties = new Map();
    },

    // Splits `raw` into its blocks, to be iterated one line at a time.
    //   raw:     a string in the serialized form described above
    //   blockId: optional id of the block to select right away
    Reader: function(raw, blockId) {
        this.io = µBlock.CompiledLineIO;
        this.block = '';
        this.len = 0;
        this.offset = 0;
        this.line = '';
        this.parser = this.io.unserialize;
        this.blocks = new Map();
        this.properties = new Map();
        const reBlockStart = new RegExp(
            '^' + this.io.blockStartPrefix + '(\\d+)\\n',
            'gm'
        );
        let match = reBlockStart.exec(raw);
        while ( match !== null ) {
            const beg = match.index + match[0].length;
            const end = raw.indexOf(this.io.blockEndPrefix + match[1], beg);
            if ( end === -1 ) {
                // Unterminated block: keep whatever is left as its content
                // and stop. Using -1 as `lastIndex` would restart the
                // search from the beginning and never terminate.
                this.blocks.set(parseInt(match[1], 10), raw.slice(beg));
                break;
            }
            this.blocks.set(parseInt(match[1], 10), raw.slice(beg, end));
            reBlockStart.lastIndex = end;
            match = reBlockStart.exec(raw);
        }
        if ( blockId !== undefined ) {
            this.select(blockId);
        }
    }
};

µBlock.CompiledLineIO.Writer.prototype = {
    // Serialize `args` and append the result to the selected block.
    push: function(args) {
        this.block[this.block.length] = this.stringifier(args);
    },
    // Make `blockId` the block receiving pushed entries, creating the
    // block if it does not exist yet.
    select: function(blockId) {
        if ( blockId === this.blockId ) { return; }
        this.blockId = blockId;
        this.block = this.blocks.get(blockId);
        if ( this.block === undefined ) {
            this.blocks.set(blockId, (this.block = []));
        }
    },
    // Return the serialized form of all non-empty blocks.
    toString: function() {
        const result = [];
        for ( const [ id, lines ] of this.blocks ) {
            if ( lines.length === 0 ) { continue; }
            result.push(
                this.io.blockStartPrefix + id,
                lines.join('\n'),
                this.io.blockEndPrefix + id
            );
        }
        return result.join('\n');
    }
};

µBlock.CompiledLineIO.Reader.prototype = {
    // Load the next line of the selected block into `this.line`.
    // Returns false once there are no more lines, in which case
    // `this.line` is the empty string.
    next: function() {
        if ( this.offset === this.len ) {
            this.line = '';
            return false;
        }
        const pos = this.block.indexOf('\n', this.offset);
        if ( pos !== -1 ) {
            this.line = this.block.slice(this.offset, pos);
            this.offset = pos + 1;
        } else {
            this.line = this.block.slice(this.offset);
            this.offset = this.len;
        }
        return true;
    },
    // Make `blockId` the block being iterated, and rewind to its first
    // line. An unknown block id selects an empty block.
    select: function(blockId) {
        this.block = this.blocks.get(blockId) || '';
        this.len = this.block.length;
        this.offset = 0;
        return this;
    },
    // The current line, still in its serialized form.
    fingerprint: function() {
        return this.line;
    },
    // The current line, unserialized.
    args: function() {
        return this.parser(this.line);
    }
};
/******************************************************************************/
// I want this helper to be self-maintained, callers must not worry about
// this helper cleaning after itself by asking them to reset it when it is no
// longer needed. A timer will be used for self-garbage-collect.
// Cleaning up 10s after last hit sounds reasonable.
µBlock.stringDeduplicater = {
    strings: new Map(),
    timer: undefined,
    last: 0,
    // Return the instance on record for the string `s`, `s` itself becoming
    // the instance on record when none exists yet. A cleanup timer is
    // started if none is pending, and each lookup refreshes the time of
    // last use.
    lookup: function(s) {
        let canonical = this.strings.get(s);
        if ( canonical === undefined ) {
            this.strings.set(s, s);
            canonical = this.strings.get(s);
            if ( this.timer === undefined ) {
                this.timer = vAPI.setTimeout(() => { this.cleanup(); }, 10000);
            }
        }
        this.last = Date.now();
        return canonical;
    },
    // Timer callback: forget all strings unless there was a lookup less
    // than 10s ago, in which case try again later.
    cleanup: function() {
        const idleTime = Date.now() - this.last;
        if ( idleTime < 10000 ) {
            this.timer = vAPI.setTimeout(() => { this.cleanup(); }, 10000);
            return;
        }
        this.timer = undefined;
        this.strings.clear();
    }
};
/******************************************************************************/
2016-09-16 23:41:17 +02:00
// Open a new tab as per `details`, which is handed to vAPI.tabs.open().
//
// URLs starting with `logger-ui.html` get special treatment:
// - `details.shiftKey` toggles the `alwaysDetachLogger` user setting.
// - `details.popup` is set to the value of `alwaysDetachLogger`.
// - When opening as a popup, the URL is turned into an absolute one with
//   `popup=1` as query parameter, and `details.box` is set to the value
//   stored under `popupLoggerBox`, if any.
//
// Note that `details` is modified in place.
µBlock.openNewTab = function(details) {
    if ( details.url.startsWith('logger-ui.html') ) {
        if ( details.shiftKey ) {
            this.changeUserSettings(
                'alwaysDetachLogger',
                !this.userSettings.alwaysDetachLogger
            );
        }
        details.popup = this.userSettings.alwaysDetachLogger;
        if ( details.popup ) {
            const url = new URL(vAPI.getURL(details.url));
            url.searchParams.set('popup', '1');
            details.url = url.href;
            let popupLoggerBox;
            try {
                popupLoggerBox = JSON.parse(
                    vAPI.localStorage.getItem('popupLoggerBox')
                );
            } catch(ex) {
                // Best effort: an unparseable stored value just means the
                // popup opens without a remembered box.
            }
            // JSON.parse(null) returns null without throwing, so a `null`
            // coming from getItem() (presumably what is returned when
            // nothing was stored -- confirm against vAPI.localStorage)
            // must not end up as the box.
            if ( popupLoggerBox !== undefined && popupLoggerBox !== null ) {
                details.box = popupLoggerBox;
            }
        }
    }
    vAPI.tabs.open(details);
};
/******************************************************************************/
2017-01-27 19:44:52 +01:00
2017-10-21 19:43:46 +02:00
// Most-recently-used cache holding at most `size` entries.
// - `array` holds the keys, most recently used first.
// - `map` holds the key => value associations.
// - `resetTime` is the time at which the cache was created or last reset.
µBlock.MRUCache = function(size) {
    this.size = size;
    this.array = [];
    this.map = new Map();
    this.resetTime = Date.now();
};

µBlock.MRUCache.prototype = {
    // Associate `value` with `key`. A key not yet in the cache becomes the
    // most recently used one, evicting the least recently used key when
    // the cache is full. An already cached key keeps its current rank.
    add: function(key, value) {
        const found = this.map.has(key);
        this.map.set(key, value);
        if ( found ) { return; }
        if ( this.array.length === this.size ) {
            this.map.delete(this.array.pop());
        }
        this.array.unshift(key);
    },
    // Remove `key` from the cache, if present.
    remove: function(key) {
        // The entry must go from the map too, otherwise a later lookup()
        // would still find a value for a key no longer in the array.
        if ( this.map.delete(key) === false ) { return; }
        const i = this.array.indexOf(key);
        if ( i !== -1 ) {
            this.array.splice(i, 1);
        }
    },
    // Return the value associated with `key`, or `undefined` if there is
    // none. A key found becomes the most recently used one. A cached value
    // of `undefined` is indistinguishable from a miss.
    lookup: function(key) {
        const value = this.map.get(key);
        if ( value === undefined || this.array[0] === key ) {
            return value;
        }
        let i = this.array.indexOf(key);
        if ( i > 0 ) {
            // Shift preceding keys down by one, to make room up front.
            for ( ; i > 0; i-- ) {
                this.array[i] = this.array[i-1];
            }
            this.array[0] = key;
        }
        return value;
    },
    // Empty the cache.
    reset: function() {
        this.array = [];
        this.map.clear();
        this.resetTime = Date.now();
    }
};
/******************************************************************************/
2017-11-09 18:53:05 +01:00
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
// Return `s` with all characters having a special meaning in a regular
// expression prefixed with a backslash, so that the result can be used to
// match `s` literally.
µBlock.escapeRegex = function(s) {
    const reSpecialChars = /[.*+?^${}()|[\]\\]/g;
    // `$&` stands for the matched character.
    const escaped = s.replace(reSpecialChars, '\\$&');
    return escaped;
};
/******************************************************************************/
// Decompose a hostname into a list of increasingly broader hostnames, from
// the hostname itself up to `*`.
//
//   hostname:   the hostname to decompose
//   decomposed: the array to fill with the result, also the return value
//
// Examples:
//   'www.example.com' => [ 'www.example.com', 'example.com', 'com', '*' ]
//   '1.2.3.4'         => [ '1.2.3.4', '1.2.3', '1.2', '1', '*' ]
//   '[::1]'           => [ '[::1]', '*' ]
//
// If the first entry of `decomposed` is already `hostname`, the array is
// assumed to hold the decomposition of `hostname` and is returned untouched.
µBlock.decomposeHostname = (function() {
    // For performance purpose, as simple tests as possible
    const reHostnameVeryCoarse = /[g-z_-]/;
    const reIPv4VeryCoarse = /\.\d+$/;

    // Strip the leftmost label; a single label broadens to `*`.
    const toBroaderHostname = function(hostname) {
        const pos = hostname.indexOf('.');
        if ( pos !== -1 ) {
            return hostname.slice(pos + 1);
        }
        return hostname !== '*' && hostname !== '' ? '*' : '';
    };

    // Strip the rightmost component; a single component broadens to `*`.
    const toBroaderIPv4Address = function(ipaddress) {
        if ( ipaddress === '*' || ipaddress === '' ) { return ''; }
        const pos = ipaddress.lastIndexOf('.');
        if ( pos === -1 ) { return '*'; }
        return ipaddress.slice(0, pos);
    };

    // An IPv6 address broadens directly to `*`.
    const toBroaderIPv6Address = function(ipaddress) {
        return ipaddress !== '*' && ipaddress !== '' ? '*' : '';
    };

    return function decomposeHostname(hostname, decomposed) {
        if ( decomposed.length === 0 || decomposed[0] !== hostname ) {
            let broaden;
            // Only a hostname without any of the characters `g-z`, `_`
            // or `-` is considered for being an IP address.
            if ( reHostnameVeryCoarse.test(hostname) === false ) {
                if ( reIPv4VeryCoarse.test(hostname) ) {
                    broaden = toBroaderIPv4Address;
                } else if ( hostname.startsWith('[') ) {
                    broaden = toBroaderIPv6Address;
                }
            }
            if ( broaden === undefined ) {
                broaden = toBroaderHostname;
            }
            decomposed[0] = hostname;
            let i = 1;
            for (;;) {
                hostname = broaden(hostname);
                if ( hostname === '' ) { break; }
                decomposed[i++] = hostname;
            }
            // Get rid of entries left over from a previous, longer result.
            decomposed.length = i;
        }
        return decomposed;
    };
})();
/******************************************************************************/
// TODO: evaluate using TextEncoder/TextDecoder
// Return the result of serializing `s` to JSON and parsing it back.
// NOTE(review): going by the name, this is presumably meant to obtain a
// string which does not keep a larger parent string alive -- confirm.
µBlock.orphanizeString = function(s) {
    const serialized = JSON.stringify(s);
    return JSON.parse(serialized);
};
// Refactor selfie generation into a more flexible persistence mechanism
//
// The motivation is to address the higher peak memory usage at launch time
// with 3rd-gen HNTrie when a selfie was present.
//
// The selfie generation prior to this change was to collect all filtering
// data into a single data structure, and then to serialize that whole
// structure at once into storage (using JSON.stringify). However, HNTrie
// serialization requires that a large Uint32Array be converted into a plain
// JS array, which itself would be indirectly converted into a JSON string.
// This was the main reason why peak memory usage would be higher at launch
// from selfie, since the JSON string would need to be wholly unserialized
// into JS objects, which themselves would need to be converted into more
// specialized data structures (like that Uint32Array one).
//
// The solution to lower peak memory usage at launch is to refactor selfie
// generation to allow a more piecemeal approach: each filtering component is
// given the ability to serialize itself rather than being forced to be
// embedded in the master selfie. With this approach, the HNTrie buffer can
// now serialize to its own storage by converting the buffer data directly
// into a string which can be directly sent to storage. This avoids expensive
// intermediate steps such as converting into a JS array and then into a JSON
// string.
//
// As part of the refactoring, there was also opportunistic code upgrade to
// ES6 and Promise (eventually all of uBO's code will be proper ES6).
//
// Additionally, the polyfill to bring getBytesInUse() to Firefox has been
// revisited to replace the rather expensive previous implementation with an
// implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
/******************************************************************************/
// Custom base128 encoder/decoder
//
// TODO:
// Could expand the LZ4 codec API to be able to return UTF8-safe string
// representation of a compressed buffer, and thus the code below could be
// moved LZ4 codec-side.
// Encoding: each group of 7 input bytes becomes 8 output bytes, all of them
// in the 0x00-0x7F range. The first byte of a group holds the most
// significant bit of each of the 7 bytes which follow (bit 0 for the first
// one, bit 1 for the second one, etc.), and the 7 bytes which follow are the
// input bytes stripped of their most significant bit. A trailing group of
// less than 7 input bytes is encoded the same way, using only as many
// output bytes as needed.
µBlock.base128 = {
    // Encode the first `arrlen` bytes of the ArrayBuffer `arrbuf` into a
    // string made only of characters in the 0x00-0x7F range.
    // `arrlen` defaults to the whole buffer when omitted.
    encode: function(arrbuf, arrlen) {
        const inputLength = arrlen !== undefined ? arrlen : arrbuf.byteLength;
        const inbuf = new Uint8Array(arrbuf, 0, inputLength);
        let _7cnt = Math.floor(inputLength / 7);
        let outputLength = _7cnt * 8;
        const _7rem = inputLength % 7;
        if ( _7rem !== 0 ) {
            outputLength += 1 + _7rem;
        }
        const outbuf = new Uint8Array(outputLength);
        let msbits, v;
        let i = 0, j = 0;
        // Full groups of 7 bytes, unrolled.
        while ( _7cnt-- ) {
            v = inbuf[i+0];
            msbits = (v & 0x80) >>> 7;
            outbuf[j+1] = v & 0x7F;
            v = inbuf[i+1];
            msbits |= (v & 0x80) >>> 6;
            outbuf[j+2] = v & 0x7F;
            v = inbuf[i+2];
            msbits |= (v & 0x80) >>> 5;
            outbuf[j+3] = v & 0x7F;
            v = inbuf[i+3];
            msbits |= (v & 0x80) >>> 4;
            outbuf[j+4] = v & 0x7F;
            v = inbuf[i+4];
            msbits |= (v & 0x80) >>> 3;
            outbuf[j+5] = v & 0x7F;
            v = inbuf[i+5];
            msbits |= (v & 0x80) >>> 2;
            outbuf[j+6] = v & 0x7F;
            v = inbuf[i+6];
            msbits |= (v & 0x80) >>> 1;
            outbuf[j+7] = v & 0x7F;
            outbuf[j+0] = msbits;
            i += 7; j += 8;
        }
        // Trailing partial group, if any.
        if ( _7rem > 0 ) {
            msbits = 0;
            for ( let ir = 0; ir < _7rem; ir++ ) {
                v = inbuf[i+ir];
                msbits |= (v & 0x80) >>> (7 - ir);
                outbuf[j+ir+1] = v & 0x7F;
            }
            outbuf[j+0] = msbits;
        }
        // All output bytes are below 0x80, hence each one decodes to
        // exactly one character.
        const textDecoder = new TextDecoder();
        return textDecoder.decode(outbuf);
    },

    // TODO:
    // Surprisingly, there does not seem to be any performance gain when
    // first converting the input string into a Uint8Array through
    // TextEncoder. Investigate again to confirm original findings and
    // to find out whether results have changed. Not using TextEncoder()
    // to create an intermediate input buffer lower peak memory usage
    // at selfie load time.
    //
    // const textEncoder = new TextEncoder();
    // const inbuf = textEncoder.encode(instr);
    // const inputLength = inbuf.byteLength;

    // Decode a string produced by encode() back into bytes.
    //   instr:  the string to decode
    //   arrbuf: optional ArrayBuffer to decode into; when it is not an
    //           ArrayBuffer, a new buffer of the exact size is allocated
    // Returns a Uint8Array holding the decoded bytes. When `arrbuf` is
    // used, the returned array spans the whole of `arrbuf`, whatever the
    // number of decoded bytes.
    decode: function(instr, arrbuf) {
        const inputLength = instr.length;
        let _8cnt = inputLength >>> 3;
        let outputLength = _8cnt * 7;
        const _8rem = inputLength % 8;
        if ( _8rem !== 0 ) {
            outputLength += _8rem - 1;
        }
        const outbuf = arrbuf instanceof ArrayBuffer === false
            ? new Uint8Array(outputLength)
            : new Uint8Array(arrbuf);
        let msbits;
        let i = 0, j = 0;
        // Full groups of 8 characters, unrolled.
        while ( _8cnt-- ) {
            msbits = instr.charCodeAt(i+0);
            outbuf[j+0] = msbits << 7 & 0x80 | instr.charCodeAt(i+1);
            outbuf[j+1] = msbits << 6 & 0x80 | instr.charCodeAt(i+2);
            outbuf[j+2] = msbits << 5 & 0x80 | instr.charCodeAt(i+3);
            outbuf[j+3] = msbits << 4 & 0x80 | instr.charCodeAt(i+4);
            outbuf[j+4] = msbits << 3 & 0x80 | instr.charCodeAt(i+5);
            outbuf[j+5] = msbits << 2 & 0x80 | instr.charCodeAt(i+6);
            outbuf[j+6] = msbits << 1 & 0x80 | instr.charCodeAt(i+7);
            i += 8; j += 7;
        }
        // Trailing partial group, if any: a lone leading character carries
        // no data.
        if ( _8rem > 1 ) {
            msbits = instr.charCodeAt(i+0);
            for ( let ir = 1; ir < _8rem; ir++ ) {
                outbuf[j+ir-1] = msbits << (8-ir) & 0x80 | instr.charCodeAt(i+ir);
            }
        }
        return outbuf;
    },

    // Return the number of bytes which decoding `instr` yields.
    decodeSize: function(instr) {
        const size = (instr.length >>> 3) * 7;
        const rem = instr.length & 7;
        return rem === 0 ? size : size + rem - 1;
    },
};