From 69a43e07c4bc017f3320a669c1e80147c17dddcf Mon Sep 17 00:00:00 2001
From: Raymond Hill
Date: Fri, 26 Apr 2019 17:14:00 -0400
Subject: [PATCH] Ignore unknown tokens in urlTokenizer.getTokens()

Given that all tokens extracted from a single URL are potentially
iterated multiple times in a single URL-matching cycle, it pays to
ignore extracted tokens which are known not to be used anywhere in
the static filtering engine.

The gain in processing a single network request can be especially
high when dealing with long, random-looking URLs, since such URLs
are likely to consist mostly of tokens which are known not to be
in use.

---
Note: a standalone sketch of the token-gating idea follows the patch.

 src/js/background.js           |  2 +-
 src/js/static-net-filtering.js | 15 ++++++++----
 src/js/utils.js                | 44 +++++++++++++++++++++++++++-------
 3 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/src/js/background.js b/src/js/background.js
index c498faf97..7a0c45089 100644
--- a/src/js/background.js
+++ b/src/js/background.js
@@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
     // Read-only
     systemSettings: {
         compiledMagic: 12, // Increase when compiled format changes
-        selfieMagic: 11 // Increase when selfie format changes
+        selfieMagic: 12 // Increase when selfie format changes
     },
 
     restoreBackupSettings: {
diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js
index a6d7678b8..f01d4fad7 100644
--- a/src/js/static-net-filtering.js
+++ b/src/js/static-net-filtering.js
@@ -797,7 +797,7 @@ const FilterWildcard2HnAnchored = class {
     }
 };
 
-FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
+FilterWildcard2HnAnchored.prototype.reSeparators = /[^\w%.-]/;
 
 registerFilterClass(FilterWildcard2HnAnchored);
 
@@ -2163,7 +2163,7 @@ const reGoodToken = /[%0-9a-z]{2,}/g;
 const reRegexToken = /[%0-9A-Za-z]{2,}/g;
 const reRegexTokenAbort = /[([]/;
 const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
-const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*]|$)/;
+const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;
 
 const badTokens = new Set([
     'com',
@@ -2296,6 +2296,7 @@ FilterContainer.prototype.reset = function() {
     this.categories = new Map();
     this.dataFilters = new Map();
     this.filterParser.reset();
+    this.urlTokenizer.resetKnownTokens();
 
     // This will invalidate all tries
     FilterHostnameDict.reset();
@@ -2317,6 +2318,7 @@ FilterContainer.prototype.freeze = function() {
     const filterDataHolderId = FilterDataHolder.fid;
     const redirectTypeValue = typeNameToTypeValue.redirect;
     const unserialize = µb.CompiledLineIO.unserialize;
+    const knownTokens = this.urlTokenizer.knownTokens;
 
     for ( const line of this.goodFilters ) {
         if ( this.badFilters.has(line) ) {
@@ -2348,6 +2350,7 @@
                 entry.next = bucket;
             }
             this.dataFilters.set(tokenHash, entry);
+            knownTokens[tokenHash & 0xFFFF] = 1;
             continue;
         }
 
@@ -2394,6 +2397,8 @@
             continue;
         }
+        knownTokens[tokenHash & 0xFFFF] = 1;
+
         if ( entry === undefined ) {
             bucket.set(tokenHash, filterFromCompiledData(fdata));
             continue;
         }
@@ -2484,6 +2489,7 @@ FilterContainer.prototype.toSelfie = function(path) {
             discardedCount: this.discardedCount,
             categories: categoriesToSelfie(this.categories),
             dataFilters: dataFiltersToSelfie(this.dataFilters),
+            urlTokenizer: this.urlTokenizer.toSelfie(),
         })
     )
 ]);
@@ -2525,6 +2531,7 @@ FilterContainer.prototype.fromSelfie = function(path) {
         this.allowFilterCount = selfie.allowFilterCount;
         this.blockFilterCount = selfie.blockFilterCount;
         this.discardedCount = selfie.discardedCount;
+        this.urlTokenizer.fromSelfie(selfie.urlTokenizer);
         for ( const [ catbits, bucket ] of selfie.categories ) {
             const tokenMap = new Map();
             for ( const [ token, fdata ] of bucket ) {
@@ -2742,8 +2749,8 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out
         toAdd = new Map(),
         toRemove = new Map();
 
-    let tokenHashes = this.urlTokenizer.getTokens(),
-        i = 0;
+    const tokenHashes = this.urlTokenizer.getTokens();
+    let i = 0;
     while ( i < 32 ) {
         let tokenHash = tokenHashes[i++];
         if ( tokenHash === 0 ) { break; }
diff --git a/src/js/utils.js b/src/js/utils.js
index 129b03994..696a0b696 100644
--- a/src/js/utils.js
+++ b/src/js/utils.js
@@ -65,6 +65,9 @@
         this._urlOut = '';
         this._tokenized = false;
         this._tokens = [ 0 ];
+
+        this.knownTokens = new Uint8Array(65536);
+        this.resetKnownTokens();
     }
 
     setURL(url) {
@@ -76,6 +79,15 @@
         return this._urlOut;
     }
 
+    resetKnownTokens() {
+        this.knownTokens.fill(0);
+        this.knownTokens[this.dotTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPSTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.noTokenHash & 0xFFFF] = 1;
+    }
+
     // Tokenize on demand.
     getTokens() {
         if ( this._tokenized ) { return this._tokens; }
@@ -92,12 +104,6 @@
         return this._tokens;
     }
 
-    _appendTokenAt(i, th, ti) {
-        this._tokens[i+0] = th;
-        this._tokens[i+1] = ti;
-        return i + 2;
-    }
-
     tokenHashFromString(s) {
         const l = s.length;
         if ( l === 0 ) { return 0; }
@@ -119,9 +125,26 @@
         return s;
     }
 
+    toSelfie() {
+        return µBlock.base64.encode(
+            this.knownTokens.buffer,
+            this.knownTokens.byteLength
+        );
+    }
+
+    fromSelfie(selfie) {
+        return µBlock.base64.decode(selfie, this.knownTokens.buffer);
+    }
+
     // https://github.com/chrisaljoudi/uBlock/issues/1118
     // We limit to a maximum number of tokens.
 
+    _appendTokenAt(i, th, ti) {
+        this._tokens[i+0] = th;
+        this._tokens[i+1] = ti;
+        return i + 2;
+    }
+
     _tokenize() {
         const tokens = this._tokens;
         let url = this._urlOut;
@@ -131,6 +154,7 @@
             url = url.slice(0, 2048);
             l = 2048;
         }
+        const knownTokens = this.knownTokens;
         const vtc = this._validTokenChars;
         let i = 0, j = 0, v, n, ti, th;
         for (;;) {
@@ -148,9 +172,11 @@
                 th = th * 64 + v;
                 n += 1;
             }
-            tokens[j+0] = th;
-            tokens[j+1] = ti;
-            j += 2;
+            if ( knownTokens[th & 0xFFFF] !== 0 ) {
+                tokens[j+0] = th;
+                tokens[j+1] = ti;
+                j += 2;
+            }
         }
     }
 })();
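
Below is the minimal, self-contained sketch of the token-gating idea
referenced in the note above. It is illustrative only: hashFromToken,
markKnownToken and extractUsefulTokens are hypothetical names, and the
hash is a crude stand-in for urlTokenizer.tokenHashFromString(); the
real logic lives in µBlock.urlTokenizer and FilterContainer.freeze().

// Sketch only -- hypothetical names, not µBlock's actual API.
// One byte per possible low-16-bit hash value: 0 = unused, 1 = known.
const knownTokens = new Uint8Array(65536);

// Crude stand-in for urlTokenizer.tokenHashFromString(): fold up to
// eight characters into one integer, six bits at a time.
const hashFromToken = s => {
    let th = s.charCodeAt(0) % 64;
    for ( let i = 1; i < Math.min(s.length, 8); i++ ) {
        th = th * 64 + s.charCodeAt(i) % 64;
    }
    return th;
};

// At freeze time: mark the hash of every token actually used by a filter.
const markKnownToken = token => {
    knownTokens[hashFromToken(token) & 0xFFFF] = 1;
};

// At match time: extract candidate tokens from a URL, keeping only
// those whose hash is marked as known.
const extractUsefulTokens = url => {
    const out = [];
    for ( const t of url.toLowerCase().match(/[0-9a-z%]{2,}/g) || [] ) {
        if ( knownTokens[hashFromToken(t) & 0xFFFF] !== 0 ) {
            out.push(t);
        }
    }
    return out;
};

markKnownToken('ads');
console.log(extractUsefulTokens('https://example.com/ads/x9k2q7vz81/banner.gif'));
// -> [ 'ads' ] (barring collisions): the random-looking segment and the
//    other unknown tokens are dropped before any filter bucket is probed.

The gate is deliberately one-sided: a zero byte proves no filter uses
the token, so it can be discarded up front; a non-zero byte may be a
hash collision, but a collision merely costs one futile bucket lookup
and can never cause a filter to be missed. This is why a 64KB
Uint8Array indexed by the low 16 bits of the token hash suffices.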