// Mirror of https://github.com/gorhill/uBlock.git -- synced 2024-11-11 17:41:03 +01:00 -- 546 lines, 18 KiB, JavaScript
/*******************************************************************************

    uBlock Origin - a browser extension to block requests.
    Copyright (C) 2017-present Raymond Hill

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see {http://www.gnu.org/licenses/}.

    Home: https://github.com/gorhill/uBlock
*/

/* globals WebAssembly */
/* exported hnTrieManager */

'use strict';

/*******************************************************************************

  The original prototype was to develop an idea I had about using jump indices
  in a TypedArray for quickly matching hostnames (or more generally strings)[1].
  Once I had a working, un-optimized prototype, I realized I had ended up
  with something formally named a "trie": <https://en.wikipedia.org/wiki/Trie>,
  hence the name. I have no idea whether the implementation here or one
  resembling it has been done elsewhere.

  "HN" in HNTrieBuilder stands for "HostName", because the trie is specialized
  to deal with matching hostnames -- which is a bit more complicated than
  matching plain strings.

  For example, `www.abc.com` is deemed matching `abc.com`, because the former
  is a subdomain of the latter. The opposite is of course not true.

  The resulting read-only tries created as a result of using hnTrieManager are
  simply just typed arrays filled with integers. The matching algorithm is
  just a matter of reading/comparing these integers, and further using them as
  indices in the array as a way to move around in the trie.

  [1] To solve <https://github.com/gorhill/uBlock/issues/3193>

*/

const hnTrieManager = {
    tree: null,                     // scratch, un-vacuumed trie: 3 uint32 per cell
    treesz: 0,                      // number of uint32 entries used in `tree`
    trie: new Uint8Array(65536),    // needle area followed by all vacuumed tries
    trie32: null,                   // uint32 view over the buffer backing `trie`
    triesz: 256,                    // bytes 0-254: decoded needle, byte 255: needle length
    id: 0,                          // incremented by reset(), stamped on HNTrieRef
    needle: '',                     // last string handed to setNeedle()
    wasmLoading: null,              // pending promise while the WASM module loads
    wasmMemory: null,               // WebAssembly.Memory backing `trie`, if any
    cleanupToken: 0,
    cleanupTimer: undefined,

    // Discard all vacuumed tries. Previously returned references become
    // invalid since the generation `id` is incremented.
    reset: function() {
        if ( this.wasmMemory === null && this.trie.byteLength > 65536 ) {
            // Plain JS buffer which has grown: shrink back to one page.
            this.trie = new Uint8Array(65536);
            this.trie32 = null;
        } else {
            this.trie.fill(0);
        }
        this.triesz = 256;
        this.needle = '';
        this.id += 1;
    },

    // Returns a promise which resolves once any pending WASM loading is done.
    readyToUse: function() {
        return this.wasmLoading instanceof Promise
            ? this.wasmLoading
            : Promise.resolve();
    },

    // Whether `ref` was created since the last call to reset().
    isValidRef: function(ref) {
        return ref !== null && ref.id === this.id;
    },

    // Copy the character codes of `needle` into bytes 0-254 of the trie
    // buffer, and its (capped) length into byte 255. Returns `this` to allow
    // chaining with matches().
    //
    // Matching proceeds from the END of the needle toward its start, hence
    // when a needle is longer than 255 characters it is the last 255
    // characters which must be kept: keeping the first 255 would cause an
    // arbitrary inner portion of the hostname to be matched as if it were
    // its right-most labels.
    setNeedle: function(needle) {
        if ( needle !== this.needle ) {
            const buf = this.trie;
            const len = Math.min(needle.length, 255);
            const skip = needle.length - len;
            buf[255] = len;
            for ( let i = 0; i < len; i++ ) {
                buf[i] = needle.charCodeAt(skip + i);
            }
            this.needle = needle;
        }
        return this;
    },

    // Match the current needle against the vacuumed trie located at byte
    // offset `itrie`. Returns 1 when the needle is equal to, or is a
    // subdomain of, one of the hostnames stored in the trie, 0 otherwise.
    matchesJS: function(itrie) {
        const buf = this.trie;
        const buf32 = this.trie32;
        let ineedle = buf[255];
        for (;;) {
            ineedle -= 1;
            // Past the start of the needle, compare against the terminator.
            const nchar = ineedle === -1 ? 0 : buf[ineedle];
            for (;;) {
                const tchar = buf[itrie+8];             // quick test: first character
                if ( tchar === nchar ) { break; }
                // Stored hostname ends here and the needle is at a label
                // boundary: the needle is a subdomain of the stored hostname.
                if ( tchar === 0 && nchar === 0x2E ) { return 1; }
                itrie = buf32[itrie >>> 2];
                if ( itrie === 0 ) { return 0; }        // no more descendants
            }
            if ( nchar === 0 ) { return 1; }
            let lxtra = buf[itrie+9];                   // length of extra characters
            if ( lxtra !== 0 ) {                        // cell holds merged characters
                if ( lxtra > ineedle ) { return 0; }
                let ixtra = itrie + 10;
                lxtra += ixtra;
                do {
                    ineedle -= 1;
                    if ( buf[ineedle] !== buf[ixtra] ) { return 0; }
                    ixtra += 1;
                } while ( ixtra !== lxtra );
            }
            itrie = buf32[itrie + 4 >>> 2];
            if ( itrie === 0 ) {
                // End of the stored hostname: match only if the needle is
                // also fully consumed or sits at a label boundary.
                return ineedle === 0 || buf[ineedle-1] === 0x2E ? 1 : 0;
            }
        }
    },
    matchesWASM: null,
    matches: null,

    // Prepare the scratch tree for a new series of add() calls.
    start: function() {
        if ( this.trie32 === null ) {
            this.trie32 = new Uint32Array(this.trie.buffer);
        }
        this.treesz = 0;
        if ( this.tree === null ) {
            this.tree = new Uint32Array(16384);
        }
        this.tree[0] = 0;
        this.tree[1] = 0;
        this.tree[2] = 0;
    },

    /***************************************************************************

        Since this trie is specialized for matching hostnames, the stored
        strings are reversed internally, because of hostname comparison logic:

        Correct matching:
          index 0123456
                abc.com
                      |
            www.abc.com
          index 01234567890

        Incorrect matching (typically used for plain strings):
          index 0123456
                abc.com
                |
                www.abc.com
          index 01234567890

    */

    // Add hostname `hn` to the scratch tree. Empty strings are ignored, and
    // only the first 255 characters of `hn` are stored. start() must have
    // been called beforehand.
    add: function(hn) {
        let ichar = hn.length - 1;
        if ( ichar === -1 ) { return; }
        // 256 * 3 + 3 = 771
        if ( this.treesz + 771 >= this.tree.length ) {
            this.growTree();
        }
        if ( ichar > 254 ) { ichar = 254; }
        let c = hn.charCodeAt(ichar);
        let i = 0;
        let inext;
        for (;;) {
            if ( this.tree[i+2] !== c ) {           // match not found
                inext = this.tree[i];               // move to descendant
                if ( inext === 0 ) { break; }       // no descendant
            } else {                                // match found
                if ( c === 0 ) { return; }
                inext = this.tree[i+1];             // move to sibling
                ichar -= 1;
                c = ichar === -1 ? 0 : hn.charCodeAt(ichar);
            }
            i = inext;
        }
        // Any new string added will always cause a new descendant to be
        // created. The only time this is not the case is when trying to
        // store a string which is already in the trie.
        inext = this.treesz;                        // new descendant cell
        this.tree[i] = inext;
        this.tree[inext+0] = 0;                     // jump index to descendant
        this.tree[inext+1] = 0;                     // jump index to sibling
        this.tree[inext+2] = c;                     // character code
        this.treesz += 3;
        if ( c === 0 ) { return; }                  // character zero is always last cell
        do {
            i = inext;                              // new branch sprouting made from
            ichar -= 1;                             // all characters left to store
            c = ichar === -1 ? 0 : hn.charCodeAt(ichar);
            inext = this.treesz;
            this.tree[i+1] = inext;
            this.tree[inext+0] = 0;
            this.tree[inext+1] = 0;
            this.tree[inext+2] = c;
            this.treesz += 3;
        } while ( c !== 0 );
    },

    // Enlarge the scratch tree by 16384 entries, preserving its content.
    growTree: function() {
        const tree = new Uint32Array(this.tree.length + 16384);
        tree.set(this.tree);
        this.tree = tree;
    },

    /***************************************************************************

        Before vacuuming, each cell is 3 entry-long:
        - Jump index to descendant (if any)
        - Jump index to sibling (if any)
        - character code

        All strings stored in the un-vacuumed trie are zero-terminated, and the
        character zero does occupy a cell like any other character. Let's
        use _ to represent character zero for sake of comments. The asterisk
        will be used to highlight a node with a descendant.

        Cases, before vacuuming:

        abc.com, abc.org: 16 cells
                                           *
        _ -- a -- b -- c -- . -- c -- o -- m
        _ -- a -- b -- c -- . -- o -- r -- g

        abc.com, xyz.com: 12 cells
                       *
        _ -- a -- b -- c -- . -- c -- o -- m
        _ -- x -- y -- z

        ab.com, b.com: 8 cells
             *
        _ -- a -- b -- . -- c -- o -- m
             _

        b.com, ab.com: 8 cells
        *
        _ -- b -- . -- c -- o -- m
   _ -- a

        Vacuuming is the process of merging sibling cells with no descendants.
        Cells with descendants can't be merged.

        Each time we arrive at the end of a horizontal branch (sibling jump
        index is 0), we walk back to the nearest previous node with descendants,
        and repeat the process. Since there is no index information on where to
        come back, a stack is used to remember cells with descendants (descendant
        jump index is non zero) encountered on the way.

        After vacuuming, each cell holds:
        - Jump index to descendant (if any)
        - Jump index to sibling (if any)
        - character code
        - length of merged character code(s)
        - merged character code(s), if any

        Cases, after vacuuming:

        abc.com, abc.org: 2 cells
                *
        [abc.co]m
        [abc.or]g

        abc.com, xyz.com: 3 cells
            *
        [ab]c -- [.co]m
        [xy]z

        ab.com, b.com: 3 cells
        *
        a -- [b.co]m
        _

        b.com, ab.com: 3 cells
        *
        _ -- [b.co]m
        a

        It's possible for a character zero cell to have descendants.

        It's not possible for a character zero cell to have next siblings.

        This will have to be taken into account during both vacuuming and
        matching.

        Character zero cells with no descendant are discarded during vacuuming.
        Character zero cells with a descendant, or character zero cells which
        are a descendant are kept into the vacuumed trie.

        A vacuumed trie is very efficient memory- and lookup-wise, but is also
        read-only: no string can be added or removed. The read-only trie is
        really just a self-sufficient array of integers, and can easily be
        exported/imported as a JSON array. It is theoretically possible to
        "decompile" a trie (vacuumed or not) into the set of strings originally
        added to it (in the order they were added with the current
        implementation), but so far I do not need this feature.

        Vacuum output array format, as written by finish() and read by
        matchesJS():
            byte 0..3: offset to descendant
            byte 4..7: offset to sibling
            byte 8: first character
            byte 9: number of extra characters
            byte 10..: extra characters, if any
        Offset values are written through a native Uint32Array, and the WASM
        version is used only on little-endian platforms.

        4 + 4 + 1 + 1 = 10 bytes for one character, otherwise
        4 + 4 + 1 + 1 + n = 10 + n bytes for one + n character(s)
        Each cell starts on a 4-byte boundary.

    */

    // Vacuum the scratch tree into the trie buffer. Returns a HNTrieRef
    // pointing to the new read-only trie, or null if nothing was added since
    // the last call to start().
    finish: function() {
        if ( this.treesz === 0 ) { return null; }
        const input = this.tree;
        const iout0 = this.triesz;
        const forks = [];
        let output = this.trie;
        let output32 = this.trie32;
        let iout1 = iout0;
        let iout2 = output.byteLength;
        let iin = 0;
        for (;;) {
            // Make sure the largest possible cell fits in the buffer.
            if ( (iout1 + 266) >= iout2 ) {
                this.growTrie();
                output = this.trie;
                output32 = this.trie32;
                iout2 = output.byteLength;
            }
            let iout = iout1;
            output32[iout >>> 2] = 0;
            output32[iout + 4 >>> 2] = 0;
            output[iout+8] = input[iin+2];              // first character
            output[iout+9] = 0;                         // extra character count
            iout1 += 10;
            if ( input[iin] !== 0 ) {                   // cell with descendant
                forks.push(iout, iin);                  // defer processing
            }
            for (;;) {                                  // merge sibling cell(s)
                iin = input[iin+1];                     // sibling cell
                if ( iin === 0 ) { break; }             // no more sibling cell
                if ( input[iin] !== 0 ) { break; }      // cell with a descendant
                if ( input[iin+2] === 0 ) { break; }    // don't merge \x00
                output[iout1] = input[iin+2];           // add character data
                iout1 += 1;
            }
            if ( iout1 !== iout + 10 ) {                // cells were merged
                output[iout+9] = iout1 - iout - 10;     // so adjust count
            }
            iout1 = (iout1 + 3) & ~3;                   // align to i32
            if ( iin !== 0 && input[iin] !== 0 ) {      // can't merge this cell
                output32[iout + 4 >>> 2] = iout1;
                continue;
            }
            if ( forks.length === 0 ) { break; }        // no more descendants: bye
            iin = forks.pop();                          // process next descendant
            iout = forks.pop();
            iin = input[iin];
            output32[iout >>> 2] = iout1;
        }
        this.triesz = iout1;
        this.cleanupAsync();
        return new HNTrieRef(iout0);
    },

    // Build a vacuumed trie from an iterable of hostnames.
    fromIterable: function(hostnames) {
        this.start();
        const hns = Array.from(hostnames).sort(function(a, b) {
            return a.length - b.length;
        });
        // https://github.com/gorhill/uBlock/issues/3328
        //   Must sort from shortest to longest.
        for ( const hn of hns ) {
            this.add(hn);
        }
        return this.finish();
    },

    // Build a vacuumed trie from a `|`-separated list of hostnames.
    fromDomainOpt: function(hostnames) {
        return this.fromIterable(hostnames.split('|'));
    },

    // Enlarge the trie buffer by 65536 bytes, preserving its content.
    growTrie: function() {
        let trie;
        if ( this.wasmMemory === null ) {
            trie = new Uint8Array(this.trie.byteLength + 65536);
            trie.set(this.trie);
        } else {
            this.wasmMemory.grow(1);
            trie = new Uint8Array(this.wasmMemory.buffer);
        }
        this.trie = trie;
        this.trie32 = new Uint32Array(this.trie.buffer);
    },

    // Release the scratch tree once no trie has been created for 10 seconds.
    cleanupAsync: function() {
        if ( this.cleanupTimer === undefined ) {
            this.cleanupToken = this.triesz;
            this.cleanupTimer = setTimeout(( ) => {
                this.cleanupTimer = undefined;
                if ( this.cleanupToken !== this.triesz ) {
                    // Tries were created in the meantime: wait some more.
                    this.cleanupAsync();
                } else {
                    this.tree = null;
                }
            }, 10000);
        }
    },

    // For debugging purpose
    // TODO: currently broken, needs to be fixed as per new buffer format.
    /*
    print: function(offset) {
        let i = offset, cc = [], indent = 0,
            forks = [];
        for (;;) {
            if ( buf[i] !== 0 ) {
                forks.push(i, indent);
            }
            cc.unshift(buf[i+2]);
            for ( let ic = 0; ic < buf[i+3]; ic++ ) {
                cc.unshift(buf[i+4+ic]);
            }
            console.log('\xB7'.repeat(indent) + String.fromCharCode.apply(null, cc));
            indent += cc.length;
            cc = [];
            i = buf[i+1];
            if ( i === 0 ) {
                if ( forks.length === 0 ) { break; }
                indent = forks.pop();
                i = forks.pop();
                i = buf[i];
            }
        }
    },
    */
};

/******************************************************************************/

// Select the matching implementation: the JS version is always installed
// first, and is replaced by the WASM version only once the WASM module has
// been successfully loaded.
(function() {
    // Default to javascript version.
    hnTrieManager.matches = hnTrieManager.matchesJS;

    if (
        typeof WebAssembly !== 'object' ||
        typeof WebAssembly.instantiateStreaming !== 'function'
    ) {
        return;
    }

    // Soft-dependency on vAPI so that the code here can be used outside of
    // uBO (i.e. tests, benchmarks)
    if (
        typeof vAPI === 'object' &&
        vAPI.webextFlavor.soup.has('firefox') === false
    ) {
        return;
    }

    // Soft-dependency on µBlock's advanced settings so that the code here can
    // be used outside of uBO (i.e. tests, benchmarks)
    if (
        typeof µBlock === 'object' &&
        µBlock.hiddenSettings.disableWebAssembly === true
    ) {
        return;
    }

    // The wasm module will work only if CPU is natively little-endian,
    // as we use native uint32 array in our trie-creation js code.
    const uint32s = new Uint32Array(1);
    const uint8s = new Uint8Array(uint32s.buffer);
    uint32s[0] = 1;
    if ( uint8s[0] !== 1 ) { return; }

    // The location of the WASM file is derived from the location of the
    // current script. When there is no `document`, no current script, or no
    // usable `src` (tests, benchmarks, inline or module scripts), stick to
    // the javascript version rather than throw.
    const script = typeof document === 'object' && document !== null
        ? document.currentScript
        : null;
    if ( typeof script !== 'object' || script === null ) { return; }
    if ( typeof script.src !== 'string' || script.src === '' ) { return; }

    // The directory from which the current script was fetched should also
    // contain the related WASM file. The script is fetched from a trusted
    // location, and consequently so will be the related WASM file.
    let workingDir;
    {
        const url = new URL(script.src);
        const match = /[^\/]+$/.exec(url.pathname);
        if ( match !== null ) {
            url.pathname = url.pathname.slice(0, match.index);
        }
        workingDir = url.href;
    }

    const memory = new WebAssembly.Memory({ initial: 1 });

    hnTrieManager.wasmLoading = WebAssembly.instantiateStreaming(
        fetch(workingDir + 'wasm/hntrie.wasm'),
        { imports: { memory } }
    ).then(result => {
        hnTrieManager.wasmLoading = null;
        if ( !result || !result.instance ) { return; }
        // Tries may have been created while the module was loading: make
        // the WASM memory at least as large as the current buffer, then
        // move the current content into it.
        const pageCount = hnTrieManager.trie.byteLength >>> 16;
        if ( pageCount > 1 ) {
            memory.grow(pageCount - 1);
        }
        const trie = new Uint8Array(memory.buffer);
        trie.set(hnTrieManager.trie);
        hnTrieManager.trie = trie;
        if ( hnTrieManager.trie32 !== null ) {
            hnTrieManager.trie32 = new Uint32Array(memory.buffer);
        }
        hnTrieManager.wasmMemory = memory;
        hnTrieManager.matchesWASM = result.instance.exports.matches;
        hnTrieManager.matches = hnTrieManager.matchesWASM;
    }).catch(reason => {
        hnTrieManager.wasmLoading = null;
        console.error(reason);
    });
})();

/******************************************************************************/

// A reference to a read-only trie stored in hnTrieManager's buffer.
//
// `offset` is the byte offset of the trie in the buffer. The reference is
// stamped with hnTrieManager's current `id` at creation time, which is what
// isValid() later compares against.
const HNTrieRef = function(offset) {
    this.id = hnTrieManager.id;
    this.offset = offset;
};

HNTrieRef.prototype = {
    // Whether hnTrieManager's `id` is still the one seen at creation time.
    isValid: function() {
        return this.id === hnTrieManager.id;
    },
    // Match `needle` using whichever implementation is currently installed
    // as hnTrieManager.matches.
    matches: function(needle) {
        return hnTrieManager.setNeedle(needle).matches(this.offset);
    },
    // Match `needle` using the javascript implementation.
    matchesJS: function(needle) {
        return hnTrieManager.setNeedle(needle).matchesJS(this.offset);
    },
    // Match `needle` using hnTrieManager.matchesWASM, which must have been
    // set by the time this is called.
    matchesWASM: function(needle) {
        return hnTrieManager.setNeedle(needle).matchesWASM(this.offset);
    },
};