mirror of
https://github.com/gorhill/uBlock.git
synced 2024-11-13 18:27:35 +01:00
Introduce three more specialized filter classes to avoid regexes
Performance- and memory-related work. Three more classes have been created to avoid regex-based filters internally. Purpose is to enforce filters which have only one single wildcard in their pattern, a common occurrence. The filter pattern is split in two literal string segments. Similar as above, with the added condition that the filter is hostname-anchored (`||`). The "Wildcard2" variant is a further specialization to enforce filters where the only wildcard is immediately preceded by the `^` special character, again a very common occurrence. Using two literal string segments in lieu of regexes allows to quickly detect a mismatch by just testing the first segment. Additionally, this reduces memory footprint as regexes are much more expensive memory-wise than plain strings. These three new filter classes allow to replace the use of 5276 regex-based filters internally with plain string-based filters. Often-called isHnAnchored() has been further fine-tuned to avoid as much work as possible. I have also observed that using an arrow function for closure-purpose helps measurably performance, as per built-in benchmark.
This commit is contained in:
parent
dfd6076a5e
commit
99390390fc
2 changed files with 232 additions and 29 deletions
|
@ -137,7 +137,7 @@ const µBlock = (function() { // jshint ignore:line
|
||||||
|
|
||||||
// Read-only
|
// Read-only
|
||||||
systemSettings: {
|
systemSettings: {
|
||||||
compiledMagic: 10, // Increase when compiled format changes
|
compiledMagic: 11, // Increase when compiled format changes
|
||||||
selfieMagic: 11 // Increase when selfie format changes
|
selfieMagic: 11 // Increase when selfie format changes
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
|
@ -102,6 +102,8 @@ const typeValueToTypeName = {
|
||||||
|
|
||||||
const BlockImportant = BlockAction | Important;
|
const BlockImportant = BlockAction | Important;
|
||||||
|
|
||||||
|
const reIsWildcarded = /[\^\*]/;
|
||||||
|
|
||||||
// ABP filters: https://adblockplus.org/en/filters
|
// ABP filters: https://adblockplus.org/en/filters
|
||||||
// regex tester: http://regex101.com/
|
// regex tester: http://regex101.com/
|
||||||
|
|
||||||
|
@ -110,10 +112,39 @@ const BlockImportant = BlockAction | Important;
|
||||||
// See the following as short-lived registers, used during evaluation. They are
|
// See the following as short-lived registers, used during evaluation. They are
|
||||||
// valid until the next evaluation.
|
// valid until the next evaluation.
|
||||||
|
|
||||||
let pageHostnameRegister = '',
|
let pageHostnameRegister = '';
|
||||||
requestHostnameRegister = '';
|
let requestHostnameRegister = '';
|
||||||
//var filterRegister = null;
|
|
||||||
//var categoryRegister = '';
|
/******************************************************************************/
|
||||||
|
|
||||||
|
// First character of match must be within the hostname part of the url.
|
||||||
|
//
|
||||||
|
// https://github.com/gorhill/uBlock/issues/1929
|
||||||
|
// Match only hostname label boundaries.
|
||||||
|
|
||||||
|
const isHnAnchored = (( ) => {
|
||||||
|
let lastLen = 0, lastBeg = -1, lastEnd = -1;
|
||||||
|
|
||||||
|
return (url, matchStart) => {
|
||||||
|
const len = requestHostnameRegister.length;
|
||||||
|
if ( len !== lastLen || url.endsWith('://', lastBeg) === false ) {
|
||||||
|
lastBeg = len !== 0 ? url.indexOf('://') : -1;
|
||||||
|
if ( lastBeg !== -1 ) {
|
||||||
|
lastBeg += 3;
|
||||||
|
lastEnd = lastBeg + len;
|
||||||
|
} else {
|
||||||
|
lastEnd = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return matchStart < lastEnd && (
|
||||||
|
matchStart === lastBeg ||
|
||||||
|
matchStart > lastBeg &&
|
||||||
|
url.charCodeAt(matchStart - 1) === 0x2E /* '.' */
|
||||||
|
);
|
||||||
|
};
|
||||||
|
})();
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
|
||||||
// Local helpers
|
// Local helpers
|
||||||
|
|
||||||
|
@ -204,27 +235,6 @@ const toLogDataInternal = function(categoryBits, tokenHash, filter) {
|
||||||
return logData;
|
return logData;
|
||||||
};
|
};
|
||||||
|
|
||||||
// First character of match must be within the hostname part of the url.
|
|
||||||
//
|
|
||||||
// https://github.com/gorhill/uBlock/issues/1929
|
|
||||||
// Match only hostname label boundaries.
|
|
||||||
const isHnAnchored = (function() {
|
|
||||||
let hostname = '';
|
|
||||||
let beg = -1, end = -1;
|
|
||||||
|
|
||||||
return function(url, matchStart) {
|
|
||||||
if ( requestHostnameRegister !== hostname ) {
|
|
||||||
const hn = requestHostnameRegister;
|
|
||||||
beg = hn !== '' ? url.indexOf(hn) : -1;
|
|
||||||
end = beg !== -1 ? beg + hn.length : -1;
|
|
||||||
hostname = hn;
|
|
||||||
}
|
|
||||||
if ( matchStart < beg || matchStart >= end ) { return false; }
|
|
||||||
return matchStart === beg ||
|
|
||||||
url.charCodeAt(matchStart - 1) === 0x2E /* '.' */;
|
|
||||||
};
|
|
||||||
})();
|
|
||||||
|
|
||||||
/*******************************************************************************
|
/*******************************************************************************
|
||||||
|
|
||||||
Each filter class will register itself in the map. A filter class
|
Each filter class will register itself in the map. A filter class
|
||||||
|
@ -536,6 +546,52 @@ FilterPlainHnAnchored.prototype.trieableId = 1;
|
||||||
|
|
||||||
registerFilterClass(FilterPlainHnAnchored);
|
registerFilterClass(FilterPlainHnAnchored);
|
||||||
|
|
||||||
|
/*******************************************************************************
|
||||||
|
|
||||||
|
Filters with only one single occurrence of wildcard `*`
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const FilterWildcard1 = class {
|
||||||
|
constructor(s0, s1) {
|
||||||
|
this.s0 = s0;
|
||||||
|
this.s1 = s1;
|
||||||
|
}
|
||||||
|
|
||||||
|
match(url) {
|
||||||
|
const pos = url.indexOf(this.s0);
|
||||||
|
return pos !== -1 && url.indexOf(this.s1, pos + this.s0.length) !== -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
logData() {
|
||||||
|
return {
|
||||||
|
raw: `${this.s0}*${this.s1}`,
|
||||||
|
regex: rawToRegexStr(`${this.s0}*${this.s1}`, 0),
|
||||||
|
compiled: this.compile()
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
compile() {
|
||||||
|
return [ this.fid, this.s0, this.s1 ];
|
||||||
|
}
|
||||||
|
|
||||||
|
static compile(details) {
|
||||||
|
if ( details.anchor !== 0 ) { return; }
|
||||||
|
const s = details.f;
|
||||||
|
let pos = s.indexOf('*');
|
||||||
|
if ( pos === -1 ) { return; }
|
||||||
|
if ( reIsWildcarded.test(s.slice(pos + 1)) ) { return; }
|
||||||
|
if ( reIsWildcarded.test(s.slice(0, pos)) ) { return; }
|
||||||
|
return [ FilterWildcard1.fid, s.slice(0, pos), s.slice(pos + 1) ];
|
||||||
|
}
|
||||||
|
|
||||||
|
static load(args) {
|
||||||
|
return new FilterWildcard1(args[1], args[2]);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
registerFilterClass(FilterWildcard1);
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
|
||||||
const FilterGeneric = class {
|
const FilterGeneric = class {
|
||||||
|
@ -571,6 +627,8 @@ const FilterGeneric = class {
|
||||||
}
|
}
|
||||||
|
|
||||||
static compile(details) {
|
static compile(details) {
|
||||||
|
const compiled = FilterWildcard1.compile(details);
|
||||||
|
if ( compiled !== undefined ) { return compiled; }
|
||||||
return [ FilterGeneric.fid, details.f, details.anchor ];
|
return [ FilterGeneric.fid, details.f, details.anchor ];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -583,6 +641,117 @@ FilterGeneric.prototype.re = null;
|
||||||
|
|
||||||
registerFilterClass(FilterGeneric);
|
registerFilterClass(FilterGeneric);
|
||||||
|
|
||||||
|
/*******************************************************************************
|
||||||
|
|
||||||
|
Hostname-anchored filters with only one occurrence of wildcard `*`
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const FilterWildcard1HnAnchored = class {
|
||||||
|
constructor(s0, s1) {
|
||||||
|
this.s0 = s0;
|
||||||
|
this.s1 = s1;
|
||||||
|
}
|
||||||
|
|
||||||
|
match(url) {
|
||||||
|
const pos = url.indexOf(this.s0);
|
||||||
|
return pos !== -1 &&
|
||||||
|
isHnAnchored(url, pos) &&
|
||||||
|
url.indexOf(this.s1, pos + this.s0.length) !== -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
logData() {
|
||||||
|
return {
|
||||||
|
raw: `||${this.s0}*${this.s1}`,
|
||||||
|
regex: rawToRegexStr(`${this.s0}*${this.s1}`, 0),
|
||||||
|
compiled: this.compile()
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
compile() {
|
||||||
|
return [ this.fid, this.s0, this.s1 ];
|
||||||
|
}
|
||||||
|
|
||||||
|
static compile(details) {
|
||||||
|
if ( (details.anchor & 0x0b001) !== 0 ) { return; }
|
||||||
|
const s = details.f;
|
||||||
|
let pos = s.indexOf('*');
|
||||||
|
if ( pos === -1 ) { return; }
|
||||||
|
if ( reIsWildcarded.test(s.slice(pos + 1)) ) { return; }
|
||||||
|
const needSeparator =
|
||||||
|
pos !== 0 && s.charCodeAt(pos - 1) === 0x5E /* '^' */;
|
||||||
|
if ( needSeparator ) { pos -= 1; }
|
||||||
|
if ( reIsWildcarded.test(s.slice(0, pos)) ) { return; }
|
||||||
|
if ( needSeparator ) {
|
||||||
|
return FilterWildcard2HnAnchored.compile(details, pos);
|
||||||
|
}
|
||||||
|
return [
|
||||||
|
FilterWildcard1HnAnchored.fid,
|
||||||
|
s.slice(0, pos),
|
||||||
|
s.slice(pos + 1),
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
static load(args) {
|
||||||
|
return new FilterWildcard1HnAnchored(args[1], args[2]);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
registerFilterClass(FilterWildcard1HnAnchored);
|
||||||
|
|
||||||
|
/*******************************************************************************
|
||||||
|
|
||||||
|
Hostname-anchored filters with one occurrence of the wildcard
|
||||||
|
sequence `^*` and no other wildcard-equivalent character
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const FilterWildcard2HnAnchored = class {
|
||||||
|
constructor(s0, s1) {
|
||||||
|
this.s0 = s0;
|
||||||
|
this.s1 = s1;
|
||||||
|
}
|
||||||
|
|
||||||
|
match(url) {
|
||||||
|
const pos0 = url.indexOf(this.s0);
|
||||||
|
if ( pos0 === -1 || isHnAnchored(url, pos0) === false ) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
const pos1 = pos0 + this.s0.length;
|
||||||
|
const pos2 = url.indexOf(this.s1, pos1);
|
||||||
|
return pos2 !== -1 &&
|
||||||
|
this.reSeparators.test(url.slice(pos1, pos2));
|
||||||
|
}
|
||||||
|
|
||||||
|
logData() {
|
||||||
|
return {
|
||||||
|
raw: `||${this.s0}^*${this.s1}`,
|
||||||
|
regex: rawToRegexStr(`${this.s0}^*${this.s1}`, 0),
|
||||||
|
compiled: this.compile()
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
compile() {
|
||||||
|
return [ this.fid, this.s0, this.s1 ];
|
||||||
|
}
|
||||||
|
|
||||||
|
static compile(details, pos) {
|
||||||
|
return [
|
||||||
|
FilterWildcard2HnAnchored.fid,
|
||||||
|
details.f.slice(0, pos),
|
||||||
|
details.f.slice(pos + 2),
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
static load(args) {
|
||||||
|
return new FilterWildcard2HnAnchored(args[1], args[2]);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
|
||||||
|
|
||||||
|
registerFilterClass(FilterWildcard2HnAnchored);
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
|
||||||
const FilterGenericHnAnchored = class {
|
const FilterGenericHnAnchored = class {
|
||||||
|
@ -610,6 +779,8 @@ const FilterGenericHnAnchored = class {
|
||||||
}
|
}
|
||||||
|
|
||||||
static compile(details) {
|
static compile(details) {
|
||||||
|
const compiled = FilterWildcard1HnAnchored.compile(details);
|
||||||
|
if ( compiled !== undefined ) { return compiled; }
|
||||||
return [ FilterGenericHnAnchored.fid, details.f ];
|
return [ FilterGenericHnAnchored.fid, details.f ];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1377,7 +1548,10 @@ const FilterBucket = class {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ( this.plainHnAnchoredTrie !== null && isHnAnchored(url, tokenBeg) ) {
|
if (
|
||||||
|
this.plainHnAnchoredTrie !== null &&
|
||||||
|
isHnAnchored(url, tokenBeg)
|
||||||
|
) {
|
||||||
const pos = this.plainHnAnchoredTrie.matches(url, tokenBeg);
|
const pos = this.plainHnAnchoredTrie.matches(url, tokenBeg);
|
||||||
if ( pos !== -1 ) {
|
if ( pos !== -1 ) {
|
||||||
this.plainHnAnchoredFilter.s = url.slice(tokenBeg, pos);
|
this.plainHnAnchoredFilter.s = url.slice(tokenBeg, pos);
|
||||||
|
@ -1524,7 +1698,6 @@ const FilterParser = function() {
|
||||||
this.reHasUnicode = /[^\x00-\x7F]/;
|
this.reHasUnicode = /[^\x00-\x7F]/;
|
||||||
this.reWebsocketAny = /^ws[s*]?(?::\/?\/?)?\*?$/;
|
this.reWebsocketAny = /^ws[s*]?(?::\/?\/?)?\*?$/;
|
||||||
this.reBadCSP = /(?:^|;)\s*report-(?:to|uri)\b/;
|
this.reBadCSP = /(?:^|;)\s*report-(?:to|uri)\b/;
|
||||||
this.reIsWildcarded = /[\^\*]/;
|
|
||||||
this.domainOpt = '';
|
this.domainOpt = '';
|
||||||
this.noTokenHash = µb.urlTokenizer.noTokenHash;
|
this.noTokenHash = µb.urlTokenizer.noTokenHash;
|
||||||
this.unsupportedTypeBit = this.bitFromType('unsupported');
|
this.unsupportedTypeBit = this.bitFromType('unsupported');
|
||||||
|
@ -1917,7 +2090,7 @@ FilterParser.prototype.parse = function(raw) {
|
||||||
this.anchor = 0;
|
this.anchor = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
this.wildcarded = this.reIsWildcarded.test(s);
|
this.wildcarded = reIsWildcarded.test(s);
|
||||||
|
|
||||||
// This might look weird but we gain memory footprint by not going through
|
// This might look weird but we gain memory footprint by not going through
|
||||||
// toLowerCase(), at least on Chromium. Because copy-on-write?
|
// toLowerCase(), at least on Chromium. Because copy-on-write?
|
||||||
|
@ -2985,6 +3158,36 @@ FilterContainer.prototype.bucketHistogram = function() {
|
||||||
- FilterPlainHnAnchored and FilterPlainPrefix1 are good candidates
|
- FilterPlainHnAnchored and FilterPlainPrefix1 are good candidates
|
||||||
for storing in a plain string trie.
|
for storing in a plain string trie.
|
||||||
|
|
||||||
|
As of 2019-04-25:
|
||||||
|
|
||||||
|
{"FilterPlainHnAnchored" => 11078}
|
||||||
|
{"FilterPlainPrefix1" => 7195}
|
||||||
|
{"FilterPrefix1Trie" => 5720}
|
||||||
|
{"FilterOriginHit" => 3561}
|
||||||
|
{"FilterWildcard2HnAnchored" => 2943}
|
||||||
|
{"FilterPair" => 2391}
|
||||||
|
{"FilterBucket" => 1922}
|
||||||
|
{"FilterWildcard1HnAnchored" => 1910}
|
||||||
|
{"FilterHnAnchoredTrie" => 1586}
|
||||||
|
{"FilterPlainHostname" => 1391}
|
||||||
|
{"FilterOriginHitSet" => 1155}
|
||||||
|
{"FilterPlain" => 634}
|
||||||
|
{"FilterWildcard1" => 423}
|
||||||
|
{"FilterGenericHnAnchored" => 389}
|
||||||
|
{"FilterOriginMiss" => 302}
|
||||||
|
{"FilterGeneric" => 163}
|
||||||
|
{"FilterOriginMissSet" => 150}
|
||||||
|
{"FilterRegex" => 124}
|
||||||
|
{"FilterPlainRightAnchored" => 110}
|
||||||
|
{"FilterGenericHnAndRightAnchored" => 95}
|
||||||
|
{"FilterHostnameDict" => 59}
|
||||||
|
{"FilterPlainLeftAnchored" => 30}
|
||||||
|
{"FilterJustOrigin" => 22}
|
||||||
|
{"FilterHTTPJustOrigin" => 19}
|
||||||
|
{"FilterHTTPSJustOrigin" => 18}
|
||||||
|
{"FilterExactMatch" => 5}
|
||||||
|
{"FilterOriginMixedSet" => 3}
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
FilterContainer.prototype.filterClassHistogram = function() {
|
FilterContainer.prototype.filterClassHistogram = function() {
|
||||||
|
|
Loading…
Reference in a new issue