Add HNTrie-based filter classes to store origin-only filters

Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622

Following STrie-related work in above issue, I noticed that a large
number of filters in EasyList were filters which only had to match
against the document origin. For instance, among just the top 10
most populous buckets, there were four such buckets with over
a hundred entries each:

- bits: 72, token: "http", 146 entries
- bits: 72, token: "https", 139 entries
- bits: 88, token: "http", 122 entries
- bits: 88, token: "https", 118 entries

The filters in these buckets have to be matched against all
network requests.

In order to leverage HNTrie for these filters[1], they are now handled
in a special way so as to ensure they all end up in a single HNTrie
(per bucket), which means that instead of scanning hundreds of entries
per URL, there is now a single scan per bucket per URL for these
apply-everywhere filters.

Now, any filter which fulfills ALL of the following conditions will be
processed in a special manner internally:

- Is of the form `|https://` or `|http://` or `*`; and
- Does have a `domain=` option; and
- Does not have a negated domain in its `domain=` option; and
- Does not have `csp=` option; and
- Does not have a `redirect=` option

If a filter does not fulfill ALL the conditions above, there is no
change in behavior.

A filter which matches ALL of the above will be processed in a special
manner:

- The `domain=` option will be decomposed so as to create as many
  distinct filters as there are distinct values in the `domain=` option
- This also applies to the `badfilter` version of the filter, which
  means it now becomes possible to `badfilter` only one of the
  distinct filters without having to `badfilter` all of them.
- The logger will always report these special filters with only a
  single hostname in the `domain=` option.

***

[1] HNTrie is currently WASM-ed on Firefox.
This commit is contained in:
Raymond Hill 2019-04-19 16:33:46 -04:00
parent fd9df4b374
commit 3f3a1543ea
No known key found for this signature in database
GPG key ID: 25E1490B761470C2
4 changed files with 590 additions and 440 deletions

View file

@ -137,8 +137,8 @@ const µBlock = (function() { // jshint ignore:line
// Read-only // Read-only
systemSettings: { systemSettings: {
compiledMagic: 8, // Increase when compiled format changes compiledMagic: 10, // Increase when compiled format changes
selfieMagic: 9 // Increase when selfie format changes selfieMagic: 10 // Increase when selfie format changes
}, },
restoreBackupSettings: { restoreBackupSettings: {

File diff suppressed because it is too large Load diff

View file

@ -46,7 +46,9 @@ const STRIE_CHAR1_SLOT = STRIE_TRIE0_SLOT + 3; // 67 / 268
const STRIE_TRIE0_START = STRIE_TRIE0_SLOT + 4 << 2; // 272 const STRIE_TRIE0_START = STRIE_TRIE0_SLOT + 4 << 2; // 272
const STrieContainer = function(details) { const STrieContainer = class {
constructor(details) {
if ( details instanceof Object === false ) { details = {}; } if ( details instanceof Object === false ) { details = {}; }
const len = (details.byteLength || 0) + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1); const len = (details.byteLength || 0) + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
this.buf = new Uint8Array(Math.max(len, 131072)); this.buf = new Uint8Array(Math.max(len, 131072));
@ -55,20 +57,18 @@ const STrieContainer = function(details) {
this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT]; this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT];
this.buf32[STRIE_CHAR0_SLOT] = details.char0 || 65536; this.buf32[STRIE_CHAR0_SLOT] = details.char0 || 65536;
this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT]; this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT];
}; }
STrieContainer.prototype = {
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
// Public methods // Public methods
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
reset: function() { reset() {
this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT]; this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT];
this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT]; this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT];
}, }
matches: function(iroot, a, al) { matches(iroot, a, al) {
const ar = a.length; const ar = a.length;
const char0 = this.buf32[STRIE_CHAR0_SLOT]; const char0 = this.buf32[STRIE_CHAR0_SLOT];
let icell = iroot; let icell = iroot;
@ -102,9 +102,9 @@ STrieContainer.prototype = {
if ( icell === 0 || this.buf32[icell+2] === 0 ) { return al; } if ( icell === 0 || this.buf32[icell+2] === 0 ) { return al; }
if ( al === ar ) { return -1; } if ( al === ar ) { return -1; }
} }
}, }
createOne: function(args) { createOne(args) {
if ( Array.isArray(args) ) { if ( Array.isArray(args) ) {
return new this.STrieRef(this, args[0], args[1]); return new this.STrieRef(this, args[0], args[1]);
} }
@ -118,13 +118,13 @@ STrieContainer.prototype = {
this.buf32[iroot+1] = 0; this.buf32[iroot+1] = 0;
this.buf32[iroot+2] = 0; this.buf32[iroot+2] = 0;
return new this.STrieRef(this, iroot, 0); return new this.STrieRef(this, iroot, 0);
}, }
compileOne: function(trieRef) { compileOne(trieRef) {
return [ trieRef.iroot, trieRef.size ]; return [ trieRef.iroot, trieRef.size ];
}, }
add: function(iroot, s) { add(iroot, s) {
const lschar = s.length; const lschar = s.length;
if ( lschar === 0 ) { return 0; } if ( lschar === 0 ) { return 0; }
let ischar = 0; let ischar = 0;
@ -221,26 +221,17 @@ STrieContainer.prototype = {
} }
return 1; return 1;
} }
}, }
optimize: function() { optimize() {
this.shrinkBuf(); this.shrinkBuf();
return { return {
byteLength: this.buf.byteLength, byteLength: this.buf.byteLength,
char0: this.buf32[STRIE_CHAR0_SLOT], char0: this.buf32[STRIE_CHAR0_SLOT],
}; };
},
fromIterable: function(hostnames, add) {
if ( add === undefined ) { add = 'add'; }
const trieRef = this.createOne();
for ( const hn of hostnames ) {
trieRef[add](hn);
} }
return trieRef;
},
serialize: function(encoder) { serialize(encoder) {
if ( encoder instanceof Object ) { if ( encoder instanceof Object ) {
return encoder.encode( return encoder.encode(
this.buf32.buffer, this.buf32.buffer,
@ -254,9 +245,9 @@ STrieContainer.prototype = {
this.buf32[STRIE_CHAR1_SLOT] + 3 >>> 2 this.buf32[STRIE_CHAR1_SLOT] + 3 >>> 2
) )
); );
}, }
unserialize: function(selfie, decoder) { unserialize(selfie, decoder) {
const shouldDecode = typeof selfie === 'string'; const shouldDecode = typeof selfie === 'string';
let byteLength = shouldDecode let byteLength = shouldDecode
? decoder.decodeSize(selfie) ? decoder.decodeSize(selfie)
@ -272,23 +263,13 @@ STrieContainer.prototype = {
} else { } else {
this.buf32.set(selfie); this.buf32.set(selfie);
} }
}, }
//--------------------------------------------------------------------------
// Class to hold reference to a specific trie
//--------------------------------------------------------------------------
STrieRef: function(container, iroot, size) {
this.container = container;
this.iroot = iroot;
this.size = size;
},
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
// Private methods // Private methods
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
addCell: function(idown, iright, v) { addCell(idown, iright, v) {
let icell = this.buf32[STRIE_TRIE1_SLOT]; let icell = this.buf32[STRIE_TRIE1_SLOT];
this.buf32[STRIE_TRIE1_SLOT] = icell + 12; this.buf32[STRIE_TRIE1_SLOT] = icell + 12;
icell >>>= 2; icell >>>= 2;
@ -296,9 +277,9 @@ STrieContainer.prototype = {
this.buf32[icell+1] = iright; this.buf32[icell+1] = iright;
this.buf32[icell+2] = v; this.buf32[icell+2] = v;
return icell; return icell;
}, }
addSegment: function(segment) { addSegment(segment) {
const lsegchar = segment.length; const lsegchar = segment.length;
if ( lsegchar === 0 ) { return 0; } if ( lsegchar === 0 ) { return 0; }
let char1 = this.buf32[STRIE_CHAR1_SLOT]; let char1 = this.buf32[STRIE_CHAR1_SLOT];
@ -309,9 +290,9 @@ STrieContainer.prototype = {
} while ( i !== lsegchar ); } while ( i !== lsegchar );
this.buf32[STRIE_CHAR1_SLOT] = char1; this.buf32[STRIE_CHAR1_SLOT] = char1;
return (lsegchar << 24) | isegchar; return (lsegchar << 24) | isegchar;
}, }
growBuf: function(trieGrow, charGrow) { growBuf(trieGrow, charGrow) {
const char0 = Math.max( const char0 = Math.max(
(this.buf32[STRIE_TRIE1_SLOT] + trieGrow + STRIE_PAGE_SIZE-1) & ~(STRIE_PAGE_SIZE-1), (this.buf32[STRIE_TRIE1_SLOT] + trieGrow + STRIE_PAGE_SIZE-1) & ~(STRIE_PAGE_SIZE-1),
this.buf32[STRIE_CHAR0_SLOT] this.buf32[STRIE_CHAR0_SLOT]
@ -322,16 +303,16 @@ STrieContainer.prototype = {
this.buf.length this.buf.length
); );
this.resizeBuf(bufLen, char0); this.resizeBuf(bufLen, char0);
}, }
shrinkBuf: function() { shrinkBuf() {
const char0 = this.buf32[STRIE_TRIE1_SLOT] + 24; const char0 = this.buf32[STRIE_TRIE1_SLOT] + 24;
const char1 = char0 + this.buf32[STRIE_CHAR1_SLOT] - this.buf32[STRIE_CHAR0_SLOT]; const char1 = char0 + this.buf32[STRIE_CHAR1_SLOT] - this.buf32[STRIE_CHAR0_SLOT];
const bufLen = char1 + 256; const bufLen = char1 + 256;
this.resizeBuf(bufLen, char0); this.resizeBuf(bufLen, char0);
}, }
resizeBuf: function(bufLen, char0) { resizeBuf(bufLen, char0) {
bufLen = bufLen + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1); bufLen = bufLen + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
if ( if (
bufLen === this.buf.length && bufLen === this.buf.length &&
@ -375,23 +356,35 @@ STrieContainer.prototype = {
this.buf32[STRIE_CHAR0_SLOT] = char0; this.buf32[STRIE_CHAR0_SLOT] = char0;
this.buf32[STRIE_CHAR1_SLOT] = char0 + charDataLen; this.buf32[STRIE_CHAR1_SLOT] = char0 + charDataLen;
} }
}, }
}; };
/******************************************************************************/ /*******************************************************************************
STrieContainer.prototype.STrieRef.prototype = { Class to hold reference to a specific trie
add: function(pattern) {
*/
STrieContainer.prototype.STrieRef = class {
constructor(container, iroot, size) {
this.container = container;
this.iroot = iroot;
this.size = size;
}
add(pattern) {
if ( this.container.add(this.iroot, pattern) === 1 ) { if ( this.container.add(this.iroot, pattern) === 1 ) {
this.size += 1; this.size += 1;
return true; return true;
} }
return false; return false;
}, }
matches: function(a, al) {
matches(a, al) {
return this.container.matches(this.iroot, a, al); return this.container.matches(this.iroot, a, al);
}, }
[Symbol.iterator]: function() {
[Symbol.iterator]() {
return { return {
value: undefined, value: undefined,
done: false, done: false,
@ -441,5 +434,5 @@ STrieContainer.prototype.STrieRef.prototype = {
forks: [], forks: [],
textDecoder: new TextDecoder() textDecoder: new TextDecoder()
}; };
}, }
}; };

View file

@ -41,70 +41,101 @@
// Benchmark for string-based tokens vs. safe-integer token values: // Benchmark for string-based tokens vs. safe-integer token values:
// https://gorhill.github.io/obj-vs-set-vs-map/tokenize-to-str-vs-to-int.html // https://gorhill.github.io/obj-vs-set-vs-map/tokenize-to-str-vs-to-int.html
µBlock.urlTokenizer = { µBlock.urlTokenizer = new (class {
setURL: function(url) { constructor() {
this._chars = '0123456789%abcdefghijklmnopqrstuvwxyz';
this._validTokenChars = new Uint8Array(128);
for ( let i = 0, n = this._chars.length; i < n; i++ ) {
this._validTokenChars[this._chars.charCodeAt(i)] = i + 1;
}
this._charsEx = '0123456789%abcdefghijklmnopqrstuvwxyz*.';
this._validTokenCharsEx = new Uint8Array(128);
for ( let i = 0, n = this._charsEx.length; i < n; i++ ) {
this._validTokenCharsEx[this._charsEx.charCodeAt(i)] = i + 1;
}
this.dotTokenHash = this.tokenHashFromString('.');
this.anyTokenHash = this.tokenHashFromString('..');
this.anyHTTPSTokenHash = this.tokenHashFromString('..https');
this.anyHTTPTokenHash = this.tokenHashFromString('..http');
this.noTokenHash = this.tokenHashFromString('*');
this._urlIn = '';
this._urlOut = '';
this._tokenized = false;
this._tokens = [ 0 ];
}
setURL(url) {
if ( url !== this._urlIn ) { if ( url !== this._urlIn ) {
this._urlIn = url; this._urlIn = url;
this._urlOut = url.toLowerCase(); this._urlOut = url.toLowerCase();
this._tokenized = false; this._tokenized = false;
} }
return this._urlOut; return this._urlOut;
}, }
// Tokenize on demand. // Tokenize on demand.
getTokens: function() { getTokens() {
if ( this._tokenized === false ) { if ( this._tokenized ) { return this._tokens; }
this._tokenize(); let i = this._tokenize();
i = this._appendTokenAt(i, this.anyTokenHash, 0);
if ( this._urlOut.startsWith('https://') ) {
i = this._appendTokenAt(i, this.anyHTTPSTokenHash, 0);
} else if ( this._urlOut.startsWith('http://') ) {
i = this._appendTokenAt(i, this.anyHTTPTokenHash, 0);
}
i = this._appendTokenAt(i, this.noTokenHash, 0);
this._tokens[i] = 0;
this._tokenized = true; this._tokenized = true;
}
return this._tokens; return this._tokens;
},
tokenHashFromString: function(s) {
var l = s.length;
if ( l === 0 ) { return 0; }
if ( l === 1 ) {
if ( s === '*' ) { return 63; }
if ( s === '.' ) { return 62; }
} }
var vtc = this._validTokenChars,
th = vtc[s.charCodeAt(0)]; _appendTokenAt(i, th, ti) {
for ( var i = 1; i !== 8 && i !== l; i++ ) { this._tokens[i+0] = th;
this._tokens[i+1] = ti;
return i + 2;
}
tokenHashFromString(s) {
const l = s.length;
if ( l === 0 ) { return 0; }
const vtc = this._validTokenCharsEx;
let th = vtc[s.charCodeAt(0)];
for ( let i = 1; i !== 8 && i !== l; i++ ) {
th = th * 64 + vtc[s.charCodeAt(i)]; th = th * 64 + vtc[s.charCodeAt(i)];
} }
return th; return th;
}, }
stringFromTokenHash: function(th) { stringFromTokenHash(th) {
if ( th === 0 ) { return ''; } if ( th === 0 ) { return ''; }
if ( th === 63 ) { return '*'; }
if ( th === 62 ) { return '.'; }
const chars = '0123456789%abcdefghijklmnopqrstuvwxyz';
let s = ''; let s = '';
while ( th > 0 ) { while ( th > 0 ) {
s = `${chars.charAt((th & 0b111111)-1)}${s}`; s = `${this._charsEx.charAt((th & 0b111111)-1)}${s}`;
th /= 64; th /= 64;
} }
return s; return s;
}, }
// https://github.com/chrisaljoudi/uBlock/issues/1118 // https://github.com/chrisaljoudi/uBlock/issues/1118
// We limit to a maximum number of tokens. // We limit to a maximum number of tokens.
_tokenize: function() { _tokenize() {
var tokens = this._tokens, const tokens = this._tokens;
url = this._urlOut, let url = this._urlOut;
l = url.length; let l = url.length;
if ( l === 0 ) { tokens[0] = 0; return; } if ( l === 0 ) { return 0; }
if ( l > 2048 ) { if ( l > 2048 ) {
url = url.slice(0, 2048); url = url.slice(0, 2048);
l = 2048; l = 2048;
} }
var i = 0, j = 0, v, n, ti, th, const vtc = this._validTokenChars;
vtc = this._validTokenChars; let i = 0, j = 0, v, n, ti, th;
for (;;) { for (;;) {
for (;;) { for (;;) {
if ( i === l ) { tokens[j] = 0; return; } if ( i === l ) { return j; }
v = vtc[url.charCodeAt(i++)]; v = vtc[url.charCodeAt(i++)];
if ( v !== 0 ) { break; } if ( v !== 0 ) { break; }
} }
@ -117,25 +148,12 @@
th = th * 64 + v; th = th * 64 + v;
n += 1; n += 1;
} }
tokens[j++] = th; tokens[j+0] = th;
tokens[j++] = ti; tokens[j+1] = ti;
j += 2;
} }
},
_urlIn: '',
_urlOut: '',
_tokenized: false,
_tokens: [ 0 ],
_validTokenChars: (function() {
var vtc = new Uint8Array(128),
chars = '0123456789%abcdefghijklmnopqrstuvwxyz',
i = chars.length;
while ( i-- ) {
vtc[chars.charCodeAt(i)] = i + 1;
} }
return vtc; })();
})()
};
/******************************************************************************/ /******************************************************************************/