From 69a43e07c4bc017f3320a669c1e80147c17dddcf Mon Sep 17 00:00:00 2001
From: Raymond Hill
Date: Fri, 26 Apr 2019 17:14:00 -0400
Subject: [PATCH] Ignore unknown tokens in urlTokenizer.getTokens()

Given that all tokens extracted from a single URL are potentially
iterated multiple times in a single URL-matching cycle, it pays to
ignore extracted tokens which are known not to be used anywhere in
the static filtering engine.

The gain in processing a single network request can be especially
high when dealing with long, random-looking URLs, since such URLs
are likely to consist mostly of tokens which are known not to be
in use.

---
Note: a standalone sketch of the token-gating idea follows the patch.

 src/js/background.js           |  2 +-
 src/js/static-net-filtering.js | 15 ++++++++----
 src/js/utils.js                | 44 +++++++++++++++++++++++++++-------
 3 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/src/js/background.js b/src/js/background.js
index c498faf97..7a0c45089 100644
--- a/src/js/background.js
+++ b/src/js/background.js
@@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
     // Read-only
     systemSettings: {
         compiledMagic: 12, // Increase when compiled format changes
-        selfieMagic: 11 // Increase when selfie format changes
+        selfieMagic: 12 // Increase when selfie format changes
     },
 
     restoreBackupSettings: {
diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js
index a6d7678b8..f01d4fad7 100644
--- a/src/js/static-net-filtering.js
+++ b/src/js/static-net-filtering.js
@@ -797,7 +797,7 @@ const FilterWildcard2HnAnchored = class {
     }
 };
 
-FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
+FilterWildcard2HnAnchored.prototype.reSeparators = /[^\w%.-]/;
 
 registerFilterClass(FilterWildcard2HnAnchored);
 
@@ -2163,7 +2163,7 @@ const reGoodToken = /[%0-9a-z]{2,}/g;
 const reRegexToken = /[%0-9A-Za-z]{2,}/g;
 const reRegexTokenAbort = /[([]/;
 const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
-const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*]|$)/;
+const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;
 
 const badTokens = new Set([
     'com',
@@ -2296,6 +2296,7 @@ FilterContainer.prototype.reset = function() {
     this.categories = new Map();
     this.dataFilters = new Map();
     this.filterParser.reset();
+    this.urlTokenizer.resetKnownTokens();
 
     // This will invalidate all tries
     FilterHostnameDict.reset();
@@ -2317,6 +2318,7 @@ FilterContainer.prototype.freeze = function() {
     const filterDataHolderId = FilterDataHolder.fid;
     const redirectTypeValue = typeNameToTypeValue.redirect;
     const unserialize = µb.CompiledLineIO.unserialize;
+    const knownTokens = this.urlTokenizer.knownTokens;
 
     for ( const line of this.goodFilters ) {
         if ( this.badFilters.has(line) ) {
@@ -2348,6 +2350,7 @@
                 entry.next = bucket;
             }
             this.dataFilters.set(tokenHash, entry);
+            knownTokens[tokenHash & 0xFFFF] = 1;
             continue;
         }
 
@@ -2394,6 +2397,8 @@
             continue;
         }
+        knownTokens[tokenHash & 0xFFFF] = 1;
+
         if ( entry === undefined ) {
             bucket.set(tokenHash, filterFromCompiledData(fdata));
             continue;
         }
@@ -2484,6 +2489,7 @@ FilterContainer.prototype.toSelfie = function(path) {
             discardedCount: this.discardedCount,
             categories: categoriesToSelfie(this.categories),
             dataFilters: dataFiltersToSelfie(this.dataFilters),
+            urlTokenizer: this.urlTokenizer.toSelfie(),
         })
     )
 ]);
@@ -2525,6 +2531,7 @@ FilterContainer.prototype.fromSelfie = function(path) {
         this.allowFilterCount = selfie.allowFilterCount;
         this.blockFilterCount = selfie.blockFilterCount;
         this.discardedCount = selfie.discardedCount;
+        this.urlTokenizer.fromSelfie(selfie.urlTokenizer);
         for ( const [ catbits, bucket ] of selfie.categories ) {
             const tokenMap = new Map();
             for ( const [ token, fdata ] of bucket ) {
@@ -2742,8 +2749,8 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out
         toAdd = new Map(),
         toRemove = new Map();
 
-    let tokenHashes = this.urlTokenizer.getTokens(),
-        i = 0;
+    const tokenHashes = this.urlTokenizer.getTokens();
+    let i = 0;
     while ( i < 32 ) {
         let tokenHash = tokenHashes[i++];
         if ( tokenHash === 0 ) { break; }
diff --git a/src/js/utils.js b/src/js/utils.js
index 129b03994..696a0b696 100644
--- a/src/js/utils.js
+++ b/src/js/utils.js
@@ -65,6 +65,9 @@
         this._urlOut = '';
         this._tokenized = false;
         this._tokens = [ 0 ];
+
+        this.knownTokens = new Uint8Array(65536);
+        this.resetKnownTokens();
     }
 
     setURL(url) {
@@ -76,6 +79,15 @@
         return this._urlOut;
     }
 
+    resetKnownTokens() {
+        this.knownTokens.fill(0);
+        this.knownTokens[this.dotTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPSTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.noTokenHash & 0xFFFF] = 1;
+    }
+
     // Tokenize on demand.
     getTokens() {
         if ( this._tokenized ) { return this._tokens; }
@@ -92,12 +104,6 @@
         return this._tokens;
     }
 
-    _appendTokenAt(i, th, ti) {
-        this._tokens[i+0] = th;
-        this._tokens[i+1] = ti;
-        return i + 2;
-    }
-
     tokenHashFromString(s) {
         const l = s.length;
         if ( l === 0 ) { return 0; }
@@ -119,9 +125,26 @@
         return s;
     }
 
+    toSelfie() {
+        return µBlock.base64.encode(
+            this.knownTokens.buffer,
+            this.knownTokens.byteLength
+        );
+    }
+
+    fromSelfie(selfie) {
+        return µBlock.base64.decode(selfie, this.knownTokens.buffer);
+    }
+
     // https://github.com/chrisaljoudi/uBlock/issues/1118
     // We limit to a maximum number of tokens.
 
+    _appendTokenAt(i, th, ti) {
+        this._tokens[i+0] = th;
+        this._tokens[i+1] = ti;
+        return i + 2;
+    }
+
     _tokenize() {
         const tokens = this._tokens;
         let url = this._urlOut;
@@ -131,6 +154,7 @@
             url = url.slice(0, 2048);
             l = 2048;
         }
+        const knownTokens = this.knownTokens;
         const vtc = this._validTokenChars;
         let i = 0, j = 0, v, n, ti, th;
         for (;;) {
@@ -148,9 +172,11 @@
                 th = th * 64 + v;
                 n += 1;
             }
-            tokens[j+0] = th;
-            tokens[j+1] = ti;
-            j += 2;
+            if ( knownTokens[th & 0xFFFF] !== 0 ) {
+                tokens[j+0] = th;
+                tokens[j+1] = ti;
+                j += 2;
+            }
         }
     }
 })();
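
Below is the minimal, self-contained sketch of the token-gating idea
referenced in the note above. It is illustrative only: hashFromToken,
markKnownToken and extractUsefulTokens are hypothetical names, and the
hash is a crude stand-in for urlTokenizer.tokenHashFromString(); the
real logic lives in µBlock.urlTokenizer and FilterContainer.freeze().

// Sketch only -- hypothetical names, not µBlock's actual API.
// One byte per possible low-16-bit hash value: 0 = unused, 1 = known.
const knownTokens = new Uint8Array(65536);

// Crude stand-in for urlTokenizer.tokenHashFromString(): fold up to
// eight characters into one integer, six bits at a time.
const hashFromToken = s => {
    let th = s.charCodeAt(0) % 64;
    for ( let i = 1; i < Math.min(s.length, 8); i++ ) {
        th = th * 64 + s.charCodeAt(i) % 64;
    }
    return th;
};

// At freeze time: mark the hash of every token actually used by a filter.
const markKnownToken = token => {
    knownTokens[hashFromToken(token) & 0xFFFF] = 1;
};

// At match time: extract candidate tokens from a URL, keeping only
// those whose hash is marked as known.
const extractUsefulTokens = url => {
    const out = [];
    for ( const t of url.toLowerCase().match(/[0-9a-z%]{2,}/g) || [] ) {
        if ( knownTokens[hashFromToken(t) & 0xFFFF] !== 0 ) {
            out.push(t);
        }
    }
    return out;
};

markKnownToken('ads');
console.log(extractUsefulTokens('https://example.com/ads/x9k2q7vz81/banner.gif'));
// -> [ 'ads' ] (barring collisions): the random-looking segment and the
//    other unknown tokens are dropped before any filter bucket is probed.

The gate is deliberately one-sided: a zero byte proves no filter uses
the token, so it can be discarded up front; a non-zero byte may be a
hash collision, but a collision merely costs one futile bucket lookup
and can never cause a filter to be missed. This is why a 64KB
Uint8Array indexed by the low 16 bits of the token hash suffices.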