uBlock/src/js/static-net-filtering.js

3404 lines
100 KiB
JavaScript
Raw Normal View History

2014-06-24 00:42:43 +02:00
/*******************************************************************************
2016-06-27 03:15:18 +02:00
uBlock Origin - a browser extension to block requests.
2018-07-22 16:47:02 +02:00
Copyright (C) 2014-present Raymond Hill
2014-06-24 00:42:43 +02:00
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see {http://www.gnu.org/licenses/}.
Home: https://github.com/gorhill/uBlock
*/
/* jshint bitwise: false */
/* global punycode */
2016-06-27 03:15:18 +02:00
'use strict';
2014-06-24 00:42:43 +02:00
/******************************************************************************/
µBlock.staticNetFilteringEngine = (( ) => {
2014-06-24 00:42:43 +02:00
/******************************************************************************/
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const µb = µBlock;
2014-09-14 22:20:40 +02:00
2014-07-14 17:24:59 +02:00
// fedcba9876543210
// | | |||
// | | |||
// | | |||
// | | |||
// | | ||+---- bit 0: [BlockAction | AllowAction]
// | | |+----- bit 1: `important`
// | | +------ bit 2- 3: party [0 - 3]
// | +-------- bit 4- 8: type [0 - 31]
// +------------- bit 9-15: unused
2014-09-20 16:44:04 +02:00
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const BlockAction = 0 << 0;
const AllowAction = 1 << 0;
const Important = 1 << 1;
const AnyParty = 0 << 2;
const FirstParty = 1 << 2;
const ThirdParty = 2 << 2;
2014-06-24 00:42:43 +02:00
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const typeNameToTypeValue = {
'no_type': 0 << 4,
2015-03-26 00:28:22 +01:00
'stylesheet': 1 << 4,
'image': 2 << 4,
'object': 3 << 4,
2017-11-03 21:51:28 +01:00
'object_subrequest': 3 << 4,
2015-03-26 00:28:22 +01:00
'script': 4 << 4,
'fetch': 5 << 4,
2015-03-26 00:28:22 +01:00
'xmlhttprequest': 5 << 4,
'sub_frame': 6 << 4,
2015-04-05 16:38:47 +02:00
'font': 7 << 4,
2016-03-07 01:16:46 +01:00
'media': 8 << 4,
'websocket': 9 << 4,
'other': 10 << 4,
'popup': 11 << 4, // start of behavorial filtering
'popunder': 12 << 4,
'main_frame': 13 << 4, // start of 1st-party-only behavorial filtering
'generichide': 14 << 4,
2017-09-16 13:49:43 +02:00
'inline-font': 15 << 4,
'inline-script': 16 << 4,
'data': 17 << 4, // special: a generic data holder
'redirect': 18 << 4,
'webrtc': 19 << 4,
'unsupported': 20 << 4
2014-09-21 02:06:55 +02:00
};
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const otherTypeBitValue = typeNameToTypeValue.other;
2014-09-21 02:06:55 +02:00
// All network request types to bitmap
// bring origin to 0 (from 4 -- see typeNameToTypeValue)
// left-shift 1 by the above-calculated value
// subtract 1 to set all type bits
const allNetworkTypesBits =
(1 << (otherTypeBitValue >>> 4)) - 1;
const allTypesBits =
allNetworkTypesBits |
1 << (typeNameToTypeValue['popup'] >>> 4) - 1 |
1 << (typeNameToTypeValue['main_frame'] >>> 4) - 1 |
1 << (typeNameToTypeValue['inline-font'] >>> 4) - 1 |
1 << (typeNameToTypeValue['inline-script'] >>> 4) - 1;
const unsupportedTypeBit =
1 << (typeNameToTypeValue['unsupported'] >>> 4) - 1;
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const typeValueToTypeName = {
1: 'stylesheet',
2: 'image',
3: 'object',
4: 'script',
5: 'xmlhttprequest',
2015-07-13 14:49:58 +02:00
6: 'subdocument',
7: 'font',
2016-03-07 01:16:46 +01:00
8: 'media',
9: 'websocket',
10: 'other',
11: 'popup',
12: 'popunder',
13: 'document',
14: 'generichide',
2017-09-16 13:49:43 +02:00
15: 'inline-font',
16: 'inline-script',
17: 'data',
18: 'redirect',
19: 'webrtc',
20: 'unsupported'
};
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
const BlockImportant = BlockAction | Important;
2016-11-08 13:13:26 +01:00
const reIsWildcarded = /[\^\*]/;
2014-06-24 00:42:43 +02:00
// ABP filters: https://adblockplus.org/en/filters
// regex tester: http://regex101.com/
/******************************************************************************/
2015-02-05 00:06:31 +01:00
// See the following as short-lived registers, used during evaluation. They are
// valid until the next evaluation.
let urlRegister = '';
let pageHostnameRegister = '';
let requestHostnameRegister = '';
/******************************************************************************/
// First character of match must be within the hostname part of the url.
//
// https://github.com/gorhill/uBlock/issues/1929
// Match only hostname label boundaries.
const isHnAnchored = (( ) => {
let lastLen = 0, lastBeg = -1, lastEnd = -1;
return (url, matchStart) => {
const len = requestHostnameRegister.length;
if ( len !== lastLen || url.endsWith('://', lastBeg) === false ) {
lastBeg = len !== 0 ? url.indexOf('://') : -1;
if ( lastBeg !== -1 ) {
lastBeg += 3;
lastEnd = lastBeg + len;
} else {
lastEnd = -1;
}
}
return matchStart < lastEnd && (
matchStart === lastBeg ||
matchStart > lastBeg &&
url.charCodeAt(matchStart - 1) === 0x2E /* '.' */
);
};
})();
/******************************************************************************/
2015-02-05 00:06:31 +01:00
2015-06-09 23:01:31 +02:00
// Local helpers
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const normalizeRegexSource = function(s) {
2015-10-26 16:23:56 +01:00
try {
const re = new RegExp(s);
2016-01-17 02:21:17 +01:00
return re.source;
2015-10-26 16:23:56 +01:00
} catch (ex) {
2016-01-17 02:21:17 +01:00
normalizeRegexSource.message = ex.toString();
2015-10-26 16:23:56 +01:00
}
2016-01-17 02:21:17 +01:00
return '';
2015-10-26 16:23:56 +01:00
};
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const rawToRegexStr = function(s, anchor) {
// https://www.loggly.com/blog/five-invaluable-techniques-to-improve-regex-performance/
2015-03-17 14:39:03 +01:00
// https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions
2016-07-01 21:15:58 +02:00
// Also: remove leading/trailing wildcards -- there is no point.
let reStr = s.replace(rawToRegexStr.escape1, '\\$&')
.replace(rawToRegexStr.escape2, '(?:[^%.0-9a-z_-]|$)')
.replace(rawToRegexStr.escape3, '')
.replace(rawToRegexStr.escape4, '[^ ]*?');
if ( anchor & 0b100 ) {
2017-08-31 20:17:55 +02:00
reStr = (
reStr.startsWith('\\.') ?
rawToRegexStr.reTextHostnameAnchor2 :
rawToRegexStr.reTextHostnameAnchor1
) + reStr;
} else if ( anchor & 0b010 ) {
2015-03-05 01:36:09 +01:00
reStr = '^' + reStr;
}
if ( anchor & 0b001 ) {
reStr += '$';
2016-08-23 16:33:28 +02:00
}
return reStr;
2015-03-02 16:41:51 +01:00
};
rawToRegexStr.escape1 = /[.+?${}()|[\]\\]/g;
rawToRegexStr.escape2 = /\^/g;
rawToRegexStr.escape3 = /^\*|\*$/g;
rawToRegexStr.escape4 = /\*/g;
2017-08-31 20:17:55 +02:00
rawToRegexStr.reTextHostnameAnchor1 = '^[a-z-]+://(?:[^/?#]+\\.)?';
rawToRegexStr.reTextHostnameAnchor2 = '^[a-z-]+://(?:[^/?#]+)?';
2015-03-02 16:41:51 +01:00
// https://github.com/uBlockOrigin/uAssets/issues/4083#issuecomment-436914727
const rawToPlainStr = function(s, anchor) {
if (
anchor === 0 &&
s.charCodeAt(0) === 0x2F /* '/' */ &&
s.length > 2 &&
s.charCodeAt(s.length-1) === 0x2F /* '/' */
) {
s = s + '*';
}
return s;
};
const filterDataSerialize = µb.CompiledLineIO.serialize;
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const toLogDataInternal = function(categoryBits, tokenHash, filter) {
if ( filter === null ) { return undefined; }
const logData = filter.logData();
logData.compiled = filterDataSerialize([
categoryBits,
tokenHash,
logData.compiled
]);
2017-05-19 14:45:19 +02:00
if ( categoryBits & 0x001 ) {
logData.raw = `@@${logData.raw}`;
}
const opts = [];
2017-05-19 14:45:19 +02:00
if ( categoryBits & 0x002 ) {
opts.push('important');
}
2017-05-19 14:45:19 +02:00
if ( categoryBits & 0x008 ) {
opts.push('third-party');
2017-05-19 14:45:19 +02:00
} else if ( categoryBits & 0x004 ) {
opts.push('first-party');
}
const type = categoryBits & 0x1F0;
2017-09-16 13:49:43 +02:00
if ( type !== 0 && type !== typeNameToTypeValue.data ) {
opts.push(typeValueToTypeName[type >>> 4]);
}
if ( logData.opts !== undefined ) {
opts.push(logData.opts);
}
if ( opts.length !== 0 ) {
logData.raw += '$' + opts.join(',');
}
return logData;
2015-06-09 23:01:31 +02:00
};
/*******************************************************************************
Each filter class will register itself in the map. A filter class
id MUST always stringify to ONE single character.
IMPORTANT: any change which modifies the mapping will have to be
reflected with µBlock.systemSettings.compiledMagic.
*/
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const filterClasses = [];
let filterClassIdGenerator = 0;
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const registerFilterClass = function(ctor) {
let fid = filterClassIdGenerator++;
ctor.fid = ctor.prototype.fid = fid;
filterClasses[fid] = ctor;
};
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const filterFromCompiledData = function(args) {
return filterClasses[args[0]].load(args);
};
/******************************************************************************/
const FilterTrue = class {
match() {
return true;
}
logData() {
return {
raw: '*',
regex: '^',
compiled: this.compile(),
};
}
compile() {
return [ this.fid ];
}
static compile() {
return [ FilterTrue.fid ];
}
static load() {
return FilterTrue.instance;
}
};
FilterTrue.instance = new FilterTrue();
2014-06-24 00:42:43 +02:00
registerFilterClass(FilterTrue);
2014-06-24 00:42:43 +02:00
/******************************************************************************/
const FilterPlain = class {
constructor(s) {
this.s = s;
}
2014-06-24 00:42:43 +02:00
match(url, tokenBeg) {
return url.startsWith(this.s, tokenBeg);
}
2014-06-24 00:42:43 +02:00
logData() {
return {
raw: rawToPlainStr(this.s, 0),
regex: rawToRegexStr(this.s, 0),
compiled: this.compile()
};
}
2014-06-24 00:42:43 +02:00
compile() {
return [ this.fid, this.s, this.tokenBeg ];
}
2014-06-24 00:42:43 +02:00
addToTrie(trie) {
trie.add(this.s, this.tokenBeg);
}
static compile(details) {
return [ FilterPlain.fid, details.f, details.tokenBeg ];
}
2014-09-08 23:46:58 +02:00
static load(args) {
if ( args[2] === 0 ) {
return new FilterPlain(args[1]);
}
if ( args[2] === 1 ) {
return new FilterPlain1(args[1]);
}
return new FilterPlainX(args[1], args[2]);
}
static addToTrie(args, trie) {
trie.add(args[1], args[2]);
}
2015-02-24 00:31:29 +01:00
};
FilterPlain.trieableId = 0;
FilterPlain.prototype.trieableId = FilterPlain.trieableId;
FilterPlain.prototype.tokenBeg = 0;
registerFilterClass(FilterPlain);
2014-09-08 23:46:58 +02:00
const FilterPlain1 = class extends FilterPlain {
match(url, tokenBeg) {
return url.startsWith(this.s, tokenBeg - 1);
}
};
2014-06-24 00:42:43 +02:00
FilterPlain1.prototype.tokenBeg = 1;
2014-06-24 00:42:43 +02:00
2014-09-08 23:46:58 +02:00
const FilterPlainX = class extends FilterPlain {
constructor(s, tokenBeg) {
super(s);
this.tokenBeg = tokenBeg;
}
2014-09-08 23:46:58 +02:00
match(url, tokenBeg) {
return url.startsWith(this.s, tokenBeg - this.tokenBeg);
}
2015-02-24 00:31:29 +01:00
};
2014-08-28 15:59:05 +02:00
/******************************************************************************/
const FilterPlainHostname = class {
constructor(s) {
this.s = s;
}
2014-06-24 00:42:43 +02:00
match() {
const haystack = requestHostnameRegister;
const needle = this.s;
if ( haystack.endsWith(needle) === false ) { return false; }
const offset = haystack.length - needle.length;
return offset === 0 || haystack.charCodeAt(offset - 1) === 0x2E /* '.' */;
}
2014-06-24 00:42:43 +02:00
logData() {
return {
raw: `||${this.s}^`,
regex: rawToRegexStr(`${this.s}^`, 0),
compiled: this.compile()
};
}
2014-08-28 15:59:05 +02:00
compile() {
return [ this.fid, this.s ];
}
2014-09-08 23:46:58 +02:00
static compile(details) {
return [ FilterPlainHostname.fid, details.f ];
}
2015-02-24 00:31:29 +01:00
static load(args) {
return new FilterPlainHostname(args[1]);
}
2014-09-08 23:46:58 +02:00
};
registerFilterClass(FilterPlainHostname);
2014-06-24 00:42:43 +02:00
/******************************************************************************/
const FilterPlainLeftAnchored = class {
constructor(s) {
this.s = s;
}
2014-06-24 00:42:43 +02:00
match(url) {
return url.startsWith(this.s);
}
2014-06-24 00:42:43 +02:00
logData() {
return {
raw: `|${this.s}`,
regex: rawToRegexStr(this.s, 0b010),
compiled: this.compile()
};
}
2014-08-28 15:59:05 +02:00
compile() {
return [ this.fid, this.s ];
}
2014-09-08 23:46:58 +02:00
static compile(details) {
return [ FilterPlainLeftAnchored.fid, details.f ];
}
2015-02-24 00:31:29 +01:00
static load(args) {
return new FilterPlainLeftAnchored(args[1]);
}
2014-09-08 23:46:58 +02:00
};
registerFilterClass(FilterPlainLeftAnchored);
2014-08-28 15:59:05 +02:00
/******************************************************************************/
const FilterPlainRightAnchored = class {
constructor(s) {
this.s = s;
}
2014-06-24 00:42:43 +02:00
match(url) {
return url.endsWith(this.s);
}
2014-06-24 00:42:43 +02:00
logData() {
return {
raw: `${this.s}|`,
regex: rawToRegexStr(this.s, 0b001),
compiled: this.compile()
};
}
2014-08-28 15:59:05 +02:00
compile() {
return [ this.fid, this.s ];
}
2014-09-08 23:46:58 +02:00
static compile(details) {
return [ FilterPlainRightAnchored.fid, details.f ];
}
2015-02-24 00:31:29 +01:00
static load(args) {
return new FilterPlainRightAnchored(args[1]);
}
2014-09-08 23:46:58 +02:00
};
registerFilterClass(FilterPlainRightAnchored);
2014-06-24 00:42:43 +02:00
/******************************************************************************/
const FilterExactMatch = class {
constructor(s) {
this.s = s;
}
2017-10-09 15:28:28 +02:00
match(url) {
return url === this.s;
}
2017-10-09 15:28:28 +02:00
logData() {
return {
raw: `|${this.s}|`,
regex: rawToRegexStr(this.s, 0b011),
compiled: this.compile()
};
}
2017-10-09 15:28:28 +02:00
compile() {
return [ this.fid, this.s ];
}
2017-10-09 15:28:28 +02:00
static compile(details) {
return [ FilterExactMatch.fid, details.f ];
}
2017-10-09 15:28:28 +02:00
static load(args) {
return new FilterExactMatch(args[1]);
}
2017-10-09 15:28:28 +02:00
};
registerFilterClass(FilterExactMatch);
/******************************************************************************/
const FilterPlainHnAnchored = class {
constructor(s) {
this.s = s;
}
2014-06-24 00:42:43 +02:00
match(url, tokenBeg) {
return url.startsWith(this.s, tokenBeg) &&
isHnAnchored(url, tokenBeg);
}
2014-06-24 00:42:43 +02:00
logData() {
return {
raw: `||${this.s}`,
regex: rawToRegexStr(this.s, this.tokenBeg),
compiled: this.compile()
};
}
2014-08-28 15:59:05 +02:00
compile() {
return [ this.fid, this.s, this.tokenBeg ];
}
addToTrie(trie) {
trie.add(this.s, this.tokenBeg);
}
2014-09-08 23:46:58 +02:00
static compile(details) {
return [ FilterPlainHnAnchored.fid, details.f, details.tokenBeg ];
}
static load(args) {
if ( args[2] === 0 ) {
return new FilterPlainHnAnchored(args[1]);
}
return new FilterPlainHnAnchoredX(args[1], args[2]);
}
2015-02-24 00:31:29 +01:00
static addToTrie(args, trie) {
trie.add(args[1], args[2]);
}
2014-09-08 23:46:58 +02:00
};
FilterPlainHnAnchored.trieableId = 1;
FilterPlainHnAnchored.prototype.trieableId = FilterPlainHnAnchored.trieableId;
FilterPlainHnAnchored.prototype.tokenBeg = 0;
registerFilterClass(FilterPlainHnAnchored);
const FilterPlainHnAnchoredX = class extends FilterPlainHnAnchored {
constructor(s, tokenBeg) {
super(s);
this.tokenBeg = tokenBeg;
}
match(url, tokenBeg) {
const beg = tokenBeg - this.tokenBeg;
return url.startsWith(this.s, beg) && isHnAnchored(url, beg);
}
};
/*******************************************************************************
Filters with only one single occurrence of wildcard `*`
*/
const FilterWildcard1 = class {
constructor(s0, s1, tokenBeg) {
this.s0 = s0;
this.s1 = s1;
this.tokenBeg = tokenBeg;
}
match(url, tokenBeg) {
if ( this.tokenBeg >= 0 ) {
const s0Beg = tokenBeg - this.tokenBeg;
return s0Beg >= 0 &&
url.startsWith(this.s0, s0Beg) &&
url.indexOf(this.s1, s0Beg + this.s0.length) !== -1;
}
const s1Beg = tokenBeg + this.tokenBeg;
return s1Beg > 0 &&
url.startsWith(this.s1, s1Beg) &&
url.lastIndexOf(this.s0, s1Beg) !== -1;
}
logData() {
return {
raw: `${this.s0}*${this.s1}`,
regex: rawToRegexStr(`${this.s0}*${this.s1}`, 0),
compiled: this.compile()
};
}
compile() {
return [ this.fid, this.s0, this.s1, this.tokenBeg ];
}
static compile(details) {
if ( details.token === '*' ) { return; }
if ( details.anchor !== 0 ) { return; }
const s = details.f;
let pos = s.indexOf('*');
if ( pos === -1 ) { return; }
if ( reIsWildcarded.test(s.slice(pos + 1)) ) { return; }
if ( reIsWildcarded.test(s.slice(0, pos)) ) { return; }
return [
FilterWildcard1.fid,
s.slice(0, pos),
s.slice(pos + 1),
details.tokenBeg < pos
? details.tokenBeg
: pos + 1 - details.tokenBeg,
];
}
static load(args) {
return new FilterWildcard1(args[1], args[2], args[3]);
}
};
registerFilterClass(FilterWildcard1);
2014-08-28 15:59:05 +02:00
/******************************************************************************/
const FilterGeneric = class {
constructor(s, anchor) {
this.s = s;
this.anchor = anchor;
}
match(url) {
if ( this.re === null ) {
this.re = new RegExp(rawToRegexStr(this.s, this.anchor));
}
return this.re.test(url);
}
2014-06-24 00:42:43 +02:00
logData() {
const out = {
raw: rawToPlainStr(this.s, this.anchor),
regex: this.re.source,
compiled: this.compile()
};
if ( this.anchor & 0x2 ) {
out.raw = `|${out.raw}`;
}
if ( this.anchor & 0x1 ) {
out.raw += '|';
}
return out;
}
compile() {
return [ this.fid, this.s, this.anchor ];
}
2014-08-28 15:59:05 +02:00
static compile(details) {
const compiled = FilterWildcard1.compile(details);
if ( compiled !== undefined ) { return compiled; }
return [ FilterGeneric.fid, details.f, details.anchor ];
}
2014-09-08 23:46:58 +02:00
static load(args) {
return new FilterGeneric(args[1], args[2]);
}
2015-02-24 00:31:29 +01:00
};
FilterGeneric.prototype.re = null;
2014-09-08 23:46:58 +02:00
registerFilterClass(FilterGeneric);
2014-06-24 00:42:43 +02:00
/*******************************************************************************
Hostname-anchored filters with only one occurrence of wildcard `*`
*/
const FilterWildcard1HnAnchored = class {
constructor(s0, s1, tokenBeg) {
this.s0 = s0;
this.s1 = s1;
this.tokenBeg = tokenBeg;
}
match(url, tokenBeg) {
if ( this.tokenBeg >= 0 ) {
const s0Beg = tokenBeg - this.tokenBeg;
return s0Beg >= 0 &&
url.startsWith(this.s0, s0Beg) &&
isHnAnchored(url, s0Beg) &&
url.indexOf(this.s1, s0Beg + this.s0.length) !== -1;
}
const s1Beg = tokenBeg + this.tokenBeg;
if ( s1Beg < 0 || url.startsWith(this.s1, s1Beg) === false ) {
return false;
}
const s0Beg = url.lastIndexOf(this.s0, s1Beg);
return s0Beg !== -1 && isHnAnchored(url, s0Beg);
}
logData() {
return {
raw: `||${this.s0}*${this.s1}`,
regex: rawToRegexStr(`${this.s0}*${this.s1}`, 0),
compiled: this.compile()
};
}
compile() {
return [ this.fid, this.s0, this.s1, this.tokenBeg ];
}
static compile(details) {
if ( details.token === '*' ) { return; }
if ( (details.anchor & 0x0b001) !== 0 ) { return; }
const s = details.f;
let pos = s.indexOf('*');
if ( pos === -1 ) { return; }
if ( reIsWildcarded.test(s.slice(pos + 1)) ) { return; }
const needSeparator =
pos !== 0 && s.charCodeAt(pos - 1) === 0x5E /* '^' */;
if ( needSeparator ) { pos -= 1; }
if ( reIsWildcarded.test(s.slice(0, pos)) ) { return; }
if ( needSeparator ) {
return FilterWildcard2HnAnchored.compile(details, pos);
}
return [
FilterWildcard1HnAnchored.fid,
s.slice(0, pos),
s.slice(pos + 1),
details.tokenBeg < pos
? details.tokenBeg
: pos + 1 - details.tokenBeg,
];
}
static load(args) {
return new FilterWildcard1HnAnchored(args[1], args[2], args[3]);
}
};
registerFilterClass(FilterWildcard1HnAnchored);
/*******************************************************************************
Hostname-anchored filters with one occurrence of the wildcard
sequence `^*` and no other wildcard-equivalent character
*/
const FilterWildcard2HnAnchored = class {
constructor(s0, s1, tokenBeg) {
this.s0 = s0;
this.s1 = s1;
this.tokenBeg = tokenBeg;
}
match(url, tokenBeg) {
let s0End, s1Beg;
if ( this.tokenBeg >= 0 ) {
const s0Beg = tokenBeg - this.tokenBeg;
if ( s0Beg < 0 || url.startsWith(this.s0, s0Beg) === false ) {
return false;
}
if ( isHnAnchored(url, s0Beg) === false ) { return false; }
s0End = s0Beg + this.s0.length;
s1Beg = url.indexOf(this.s1, s0End);
if ( s1Beg === -1 ) { return false; }
} else {
s1Beg = tokenBeg + this.tokenBeg;
if ( s1Beg < 0 || url.startsWith(this.s1, s1Beg) === false ) {
return false;
}
const s0Beg = url.lastIndexOf(this.s0, s1Beg);
if ( s0Beg === -1 || isHnAnchored(url, s0Beg) === false ) {
return false;
}
s0End = s0Beg + this.s0.length;
}
return this.reSeparators.test(url.slice(s0End, s1Beg));
}
logData() {
return {
raw: `||${this.s0}^*${this.s1}`,
regex: rawToRegexStr(`${this.s0}^*${this.s1}`, 0),
compiled: this.compile()
};
}
compile() {
return [ this.fid, this.s0, this.s1, this.tokenBeg ];
}
static compile(details, pos) {
return [
FilterWildcard2HnAnchored.fid,
details.f.slice(0, pos),
details.f.slice(pos + 2),
details.tokenBeg < pos
? details.tokenBeg
: pos + 2 - details.tokenBeg,
];
}
static load(args) {
return new FilterWildcard2HnAnchored(args[1], args[2], args[3]);
}
};
FilterWildcard2HnAnchored.prototype.reSeparators = /[^\w%.-]/;
registerFilterClass(FilterWildcard2HnAnchored);
/******************************************************************************/
2014-09-19 16:59:44 +02:00
const FilterGenericHnAnchored = class {
constructor(s) {
this.s = s;
}
2014-09-19 16:59:44 +02:00
match(url) {
if ( this.re === null ) {
this.re = new RegExp(rawToRegexStr(this.s, this.anchor));
}
return this.re.test(url);
}
logData() {
return {
raw: `||${this.s}`,
regex: rawToRegexStr(this.s, this.anchor & 0b001),
compiled: this.compile()
};
}
2014-09-19 16:59:44 +02:00
compile() {
return [ this.fid, this.s ];
}
2014-09-19 16:59:44 +02:00
static compile(details) {
const compiled = FilterWildcard1HnAnchored.compile(details);
if ( compiled !== undefined ) { return compiled; }
return [ FilterGenericHnAnchored.fid, details.f ];
}
2014-09-19 16:59:44 +02:00
static load(args) {
return new FilterGenericHnAnchored(args[1]);
}
2015-02-24 00:31:29 +01:00
};
FilterGenericHnAnchored.prototype.re = null;
FilterGenericHnAnchored.prototype.anchor = 0x4;
2014-09-19 16:59:44 +02:00
registerFilterClass(FilterGenericHnAnchored);
2014-09-19 16:59:44 +02:00
/******************************************************************************/
const FilterGenericHnAndRightAnchored = class extends FilterGenericHnAnchored {
logData() {
const out = super.logData();
out.raw += '|';
return out;
}
2015-04-27 21:09:19 +02:00
static compile(details) {
return [ FilterGenericHnAndRightAnchored.fid, details.f ];
}
2015-04-27 21:09:19 +02:00
static load(args) {
return new FilterGenericHnAndRightAnchored(args[1]);
}
2015-04-27 21:09:19 +02:00
};
FilterGenericHnAndRightAnchored.prototype.anchor = 0x5;
2015-04-27 21:09:19 +02:00
registerFilterClass(FilterGenericHnAndRightAnchored);
2015-04-27 21:09:19 +02:00
/******************************************************************************/
const FilterRegex = class {
constructor(s) {
this.re = s;
}
2014-06-24 00:42:43 +02:00
match(url) {
if ( typeof this.re === 'string' ) {
this.re = new RegExp(this.re, 'i');
}
return this.re.test(url);
2017-07-11 18:21:08 +02:00
}
2014-06-24 00:42:43 +02:00
logData() {
const s = typeof this.re === 'string' ? this.re : this.re.source;
return {
raw: `/${s}/`,
regex: s,
compiled: this.compile()
};
}
2014-09-08 23:46:58 +02:00
compile() {
return [
this.fid,
typeof this.re === 'string' ? this.re : this.re.source
];
}
2014-09-08 23:46:58 +02:00
static compile(details) {
return [ FilterRegex.fid, details.f ];
}
2015-02-24 00:31:29 +01:00
static load(args) {
return new FilterRegex(args[1]);
}
2014-09-08 23:46:58 +02:00
};
2014-06-24 00:42:43 +02:00
registerFilterClass(FilterRegex);
2014-09-08 23:46:58 +02:00
/******************************************************************************/
// The optimal "class" is picked according to the content of the
// `domain=` filter option.
2014-06-24 00:42:43 +02:00
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
const filterOrigin = new (class {
constructor() {
let trieDetails;
try {
trieDetails = JSON.parse(
vAPI.localStorage.getItem('FilterOrigin.trieDetails')
);
} catch(ex) {
}
this.trieContainer = new µBlock.HNTrieContainer(trieDetails);
this.strSlots = [];
this.strToSlotId = new Map();
this.gcTimer = undefined;
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
compile(details, wrapped) {
const domainOpt = details.domainOpt;
// One hostname
if ( domainOpt.indexOf('|') === -1 ) {
if ( domainOpt.charCodeAt(0) === 0x7E /* '~' */ ) {
return FilterOriginMiss.compile(domainOpt, wrapped);
}
return FilterOriginHit.compile(domainOpt, wrapped);
}
// Many hostnames.
// Must be in set (none negated).
if ( domainOpt.indexOf('~') === -1 ) {
return FilterOriginHitSet.compile(domainOpt, wrapped);
}
// Must not be in set (all negated).
const reAllNegated = /^~(?:[^|~]+\|~)+[^|~]+$/;
if ( reAllNegated.test(domainOpt) ) {
return FilterOriginMissSet.compile(domainOpt, wrapped);
}
// Must be in one set, but not in the other.
return FilterOriginMixedSet.compile(domainOpt, wrapped);
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
slotIdFromStr(s) {
let slotId = this.strToSlotId.get(s);
if ( slotId !== undefined ) { return slotId; }
slotId = this.strSlots.push(s) - 1;
this.strToSlotId.set(s, slotId);
if ( this.gcTimer !== undefined ) { return slotId; }
this.gcTimer = self.requestIdleCallback(
( ) => {
this.gcTimer = undefined;
this.strToSlotId.clear();
},
{ timeout: 5000 }
);
return slotId;
}
strFromSlotId(slotId) {
return this.strSlots[slotId];
}
logData(out, domainOpt) {
if ( out.opts !== undefined ) { out.opts += ','; }
out.opts = `domain=${domainOpt}`;
return out;
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
readyToUse() {
return this.trieContainer.readyToUse();
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
reset() {
this.trieContainer.reset();
this.strSlots.length = 0;
this.strToSlotId.clear();
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
optimize() {
const trieDetails = this.trieContainer.optimize();
vAPI.localStorage.setItem(
'FilterOrigin.trieDetails',
JSON.stringify(trieDetails)
);
this.strToSlotId.clear();
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
})();
2014-06-24 00:42:43 +02:00
/******************************************************************************/
2015-03-02 16:41:51 +01:00
// Surprinsingly, first peeking and comparing only the first character using
// charCodeAt() does help a bit performance -- 3-6µs gain per request on
// average for Chromium 71 and Firefox 65 with default lists.
// A likely explanation is that most visits are a miss, and in such case
// calling charCodeAt() to bail out earlier is cheaper than calling endsWith().
const FilterOriginHit = class {
constructor(hostname, wrapped) {
this.hostname = hostname;
this.wrapped = wrapped;
}
match(url, tokenBeg) {
const haystack = pageHostnameRegister;
const offset = haystack.length - this.hostname.length;
if ( offset < 0 ) { return false; }
if ( haystack.charCodeAt(offset) !== this.hostname.charCodeAt(0) ) {
return false;
}
if ( haystack.endsWith(this.hostname) === false ) { return false; }
if (
offset !== 0 &&
haystack.charCodeAt(offset-1) !== 0x2E /* '.' */
) {
return false;
}
return this.wrapped.match(url, tokenBeg);
}
logData() {
const out = this.wrapped.logData();
out.compiled = [ this.fid, this.hostname, out.compiled ];
return filterOrigin.logData(out, this.hostname);
}
compile(toSelfie = false) {
return [ this.fid, this.hostname, this.wrapped.compile(toSelfie) ];
}
2015-03-02 16:41:51 +01:00
static compile(domainOpt, wrapped) {
return [ FilterOriginHit.fid, domainOpt, wrapped ];
}
2015-03-02 16:41:51 +01:00
static load(args) {
return new FilterOriginHit(
args[1],
filterFromCompiledData(args[2])
);
}
2015-03-02 16:41:51 +01:00
};
registerFilterClass(FilterOriginHit);
2015-03-02 16:41:51 +01:00
/******************************************************************************/
const FilterOriginMiss = class {
constructor(hostname, wrapped) {
this.hostname = hostname;
this.wrapped = wrapped;
}
2015-03-02 16:41:51 +01:00
match(url, tokenBeg) {
const haystack = pageHostnameRegister;
if ( haystack.endsWith(this.hostname) ) {
const offset = haystack.length - this.hostname.length;
if (
offset === 0 ||
haystack.charCodeAt(offset-1) === 0x2E /* '.' */
) {
return false;
}
}
return this.wrapped.match(url, tokenBeg);
}
logData() {
const out = this.wrapped.logData();
out.compiled = [ this.fid, this.hostname, out.compiled ];
return filterOrigin.logData(out, `~${this.hostname}`);
}
compile(toSelfie = false) {
return [ this.fid, this.hostname, this.wrapped.compile(toSelfie) ];
}
static compile(domainOpt, wrapped) {
return [ FilterOriginMiss.fid, domainOpt.slice(1), wrapped ];
}
2015-03-02 16:41:51 +01:00
static load(args) {
return new FilterOriginMiss(
args[1],
filterFromCompiledData(args[2])
);
}
};
registerFilterClass(FilterOriginMiss);
/******************************************************************************/
const FilterOriginHitSet = class {
constructor(domainOpt, wrapped, oneOf = null) {
this.domainOpt = typeof domainOpt === 'number'
? domainOpt
: filterOrigin.slotIdFromStr(domainOpt);
this.wrapped = filterFromCompiledData(wrapped);
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.oneOf = oneOf !== null
? filterOrigin.trieContainer.createOne(oneOf)
: null;
}
2015-03-02 22:22:23 +01:00
match(url, tokenBeg) {
if ( this.oneOf === null ) {
this.oneOf = filterOrigin.trieContainer.fromIterable(
filterOrigin.strFromSlotId(this.domainOpt).split('|')
);
}
return this.oneOf.matches(pageHostnameRegister) !== -1 &&
this.wrapped.match(url, tokenBeg);
}
logData() {
const out = this.wrapped.logData();
const domainOpt = filterOrigin.strFromSlotId(this.domainOpt);
out.compiled = [ this.fid, domainOpt, out.compiled ];
return filterOrigin.logData(out, domainOpt);
}
compile(toSelfie = false) {
const out = [
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.fid,
toSelfie
? this.domainOpt :
filterOrigin.strFromSlotId(this.domainOpt),
this.wrapped.compile(toSelfie),
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
];
if ( this.oneOf !== null ) {
out.push(filterOrigin.trieContainer.compileOne(this.oneOf));
}
return out;
}
static compile(domainOpt, wrapped) {
return [ FilterOriginHitSet.fid, domainOpt, wrapped ];
}
static load(args) {
return new FilterOriginHitSet(...args.slice(1));
}
};
registerFilterClass(FilterOriginHitSet);
/******************************************************************************/
const FilterOriginMissSet = class {
constructor(domainOpt, wrapped, noneOf = null) {
this.domainOpt = typeof domainOpt === 'number'
? domainOpt
: filterOrigin.slotIdFromStr(domainOpt);
this.wrapped = filterFromCompiledData(wrapped);
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.noneOf = noneOf !== null
? filterOrigin.trieContainer.createOne(noneOf)
: null;
}
2015-03-02 22:22:23 +01:00
match(url, tokenBeg) {
if ( this.noneOf === null ) {
this.noneOf = filterOrigin.trieContainer.fromIterable(
filterOrigin
.strFromSlotId(this.domainOpt)
.replace(/~/g, '')
.split('|')
);
}
return this.noneOf.matches(pageHostnameRegister) === -1 &&
this.wrapped.match(url, tokenBeg);
}
logData() {
const out = this.wrapped.logData();
const domainOpt = filterOrigin.strFromSlotId(this.domainOpt);
out.compiled = [ this.fid, domainOpt, out.compiled ];
return filterOrigin.logData(out, domainOpt);
}
compile(toSelfie = false) {
const out = [
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.fid,
toSelfie
? this.domainOpt
: filterOrigin.strFromSlotId(this.domainOpt),
this.wrapped.compile(toSelfie),
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
];
if ( this.noneOf !== null ) {
out.push(filterOrigin.trieContainer.compileOne(this.noneOf));
}
return out;
}
static compile(domainOpt, wrapped) {
return [ FilterOriginMissSet.fid, domainOpt, wrapped ];
}
2015-03-02 22:22:23 +01:00
static load(args) {
return new FilterOriginMissSet(...args.slice(1));
}
2015-03-02 22:22:23 +01:00
};
registerFilterClass(FilterOriginMissSet);
/******************************************************************************/
2015-03-02 22:22:23 +01:00
const FilterOriginMixedSet = class {
constructor(domainOpt, wrapped, oneOf = null, noneOf = null) {
this.domainOpt = typeof domainOpt === 'number'
? domainOpt
: filterOrigin.slotIdFromStr(domainOpt);
this.wrapped = filterFromCompiledData(wrapped);
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.oneOf = oneOf !== null
? filterOrigin.trieContainer.createOne(oneOf)
: null;
this.noneOf = noneOf !== null
? filterOrigin.trieContainer.createOne(noneOf)
: null;
}
2015-03-02 22:22:23 +01:00
init() {
const oneOf = [], noneOf = [];
const domainOpt = filterOrigin.strFromSlotId(this.domainOpt);
for ( const hostname of domainOpt.split('|') ) {
if ( hostname.charCodeAt(0) === 0x7E /* '~' */ ) {
noneOf.push(hostname.slice(1));
} else {
oneOf.push(hostname);
}
}
this.oneOf = filterOrigin.trieContainer.fromIterable(oneOf);
this.noneOf = filterOrigin.trieContainer.fromIterable(noneOf);
}
match(url, tokenBeg) {
if ( this.oneOf === null ) { this.init(); }
let needle = pageHostnameRegister;
return this.oneOf.matches(needle) !== -1 &&
this.noneOf.matches(needle) === -1 &&
this.wrapped.match(url, tokenBeg);
}
logData() {
const out = this.wrapped.logData();
const domainOpt = filterOrigin.strFromSlotId(this.domainOpt);
out.compiled = [ this.fid, domainOpt, out.compiled ];
return filterOrigin.logData(out, domainOpt);
}
compile(toSelfie = false) {
const out = [
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.fid,
toSelfie
? this.domainOpt
: filterOrigin.strFromSlotId(this.domainOpt),
this.wrapped.compile(toSelfie),
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
];
if ( this.oneOf !== null ) {
out.push(
filterOrigin.trieContainer.compileOne(this.oneOf),
filterOrigin.trieContainer.compileOne(this.noneOf)
);
}
return out;
}
static compile(domainOpt, wrapped) {
return [ FilterOriginMixedSet.fid, domainOpt, wrapped ];
}
static load(args) {
return new FilterOriginMixedSet(...args.slice(1));
}
};
registerFilterClass(FilterOriginMixedSet);
2015-03-02 22:22:23 +01:00
/******************************************************************************/
2015-01-23 17:32:49 +01:00
const FilterDataHolder = class {
constructor(dataType, dataStr) {
this.dataType = dataType;
this.dataStr = dataStr;
this.wrapped = undefined;
}
2015-01-23 17:32:49 +01:00
match(url, tokenBeg) {
return this.wrapped.match(url, tokenBeg);
}
logData() {
const out = this.wrapped.logData();
out.compiled = [ this.fid, this.dataType, this.dataStr, out.compiled ];
let opt = this.dataType;
if ( this.dataStr !== '' ) {
opt += `=${this.dataStr}`;
}
if ( out.opts === undefined ) {
out.opts = opt;
} else {
out.opts = opt + ',' + out.opts;
}
return out;
}
2015-01-23 17:32:49 +01:00
compile(toSelfie = false) {
return [
this.fid,
this.dataType,
this.dataStr,
this.wrapped.compile(toSelfie)
];
}
2015-01-23 17:32:49 +01:00
static compile(details) {
return [ FilterDataHolder.fid, details.dataType, details.dataStr ];
}
2015-02-24 00:31:29 +01:00
static load(args) {
const f = new FilterDataHolder(args[1], args[2]);
f.wrapped = filterFromCompiledData(args[3]);
return f;
}
2015-01-23 17:32:49 +01:00
};
registerFilterClass(FilterDataHolder);
2015-01-23 17:32:49 +01:00
// Helper class for storing instances of FilterDataHolder.
2015-01-23 17:32:49 +01:00
const FilterDataHolderEntry = class {
constructor(categoryBits, tokenHash, fdata) {
this.categoryBits = categoryBits;
this.tokenHash = tokenHash;
this.filter = filterFromCompiledData(fdata);
this.next = undefined;
}
2015-01-23 17:32:49 +01:00
logData() {
return toLogDataInternal(this.categoryBits, this.tokenHash, this.filter);
}
2015-02-24 00:31:29 +01:00
compile() {
return [ this.categoryBits, this.tokenHash, this.filter.compile() ];
}
2015-01-23 17:32:49 +01:00
static load(data) {
return new FilterDataHolderEntry(data[0], data[1], data[2]);
}
2015-01-23 17:32:49 +01:00
};
2014-09-08 23:46:58 +02:00
/******************************************************************************/
2015-02-05 00:06:31 +01:00
// Dictionary of hostnames
const FilterHostnameDict = class {
constructor(args) {
this.h = ''; // short-lived register
this.dict = FilterHostnameDict.trieContainer.createOne(args);
}
2015-02-05 00:06:31 +01:00
get size() {
2017-03-20 20:54:41 +01:00
return this.dict.size;
}
add(hn) {
return this.dict.add(hn);
}
match() {
const pos = this.dict.matches(requestHostnameRegister);
if ( pos === -1 ) { return false; }
this.h = requestHostnameRegister.slice(pos);
return true;
}
logData() {
return {
raw: `||${this.h}^`,
regex: `${rawToRegexStr(this.h, 0)}(?:[^%.0-9a-z_-]|$)`,
compiled: this.h
};
}
compile() {
return [ this.fid, FilterHostnameDict.trieContainer.compileOne(this.dict) ];
}
static readyToUse() {
return FilterHostnameDict.trieContainer.readyToUse();
}
static reset() {
return FilterHostnameDict.trieContainer.reset();
}
static optimize() {
const trieDetails = FilterHostnameDict.trieContainer.optimize();
vAPI.localStorage.setItem(
'FilterHostnameDict.trieDetails',
JSON.stringify(trieDetails)
);
}
static load(args) {
return new FilterHostnameDict(args[1]);
}
2017-03-20 20:54:41 +01:00
};
FilterHostnameDict.trieContainer = (( ) => {
let trieDetails;
try {
trieDetails = JSON.parse(
vAPI.localStorage.getItem('FilterHostnameDict.trieDetails')
);
} catch(ex) {
2015-02-05 00:06:31 +01:00
}
return new µBlock.HNTrieContainer(trieDetails);
})();
registerFilterClass(FilterHostnameDict);
2015-02-05 00:06:31 +01:00
/******************************************************************************/
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
// Dictionary of hostnames for filters which only purpose is to match
// the document origin.
const FilterJustOrigin = class {
constructor(args) {
this.h = ''; // short-lived register
this.dict = filterOrigin.trieContainer.createOne(args);
}
get size() {
return this.dict.size;
}
add(hn) {
return this.dict.add(hn);
}
match() {
const pos = this.dict.matches(pageHostnameRegister);
if ( pos === -1 ) { return false; }
this.h = pageHostnameRegister.slice(pos);
return true;
}
logData() {
return {
raw: '*',
regex: '^',
compiled: this.h,
opts: `domain=${this.h}`,
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
};
}
compile() {
return [ this.fid, filterOrigin.trieContainer.compileOne(this.dict) ];
}
static load(args) {
return new FilterJustOrigin(args[1]);
}
};
registerFilterClass(FilterJustOrigin);
/******************************************************************************/
const FilterHTTPSJustOrigin = class extends FilterJustOrigin {
match(url) {
return url.startsWith('https://') && super.match();
}
logData() {
const out = super.logData();
out.raw = '|https://';
out.regex = '^https://';
return out;
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
static load(args) {
return new FilterHTTPSJustOrigin(args[1]);
}
};
registerFilterClass(FilterHTTPSJustOrigin);
/******************************************************************************/
const FilterHTTPJustOrigin = class extends FilterJustOrigin {
match(url) {
return url.startsWith('http://') && super.match();
}
logData() {
const out = super.logData();
out.raw = '|https://';
out.regex = '^https://';
return out;
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
static load(args) {
return new FilterHTTPJustOrigin(args[1]);
}
};
registerFilterClass(FilterHTTPJustOrigin);
/******************************************************************************/
const FilterPair = class {
constructor(a, b) {
this.f1 = a;
this.f2 = b;
}
get size() {
return 2;
}
match(url, tokenBeg) {
if ( this.f1.match(url, tokenBeg) === true ) {
this.f = this.f1;
return true;
}
if ( this.f2.match(url, tokenBeg) === true ) {
this.f = this.f2;
return true;
}
return false;
}
logData() {
return this.f.logData();
}
compile(toSelfie = false) {
return [
this.fid,
this.f1.compile(toSelfie),
this.f2.compile(toSelfie)
];
}
upgrade(a) {
const bucket = new FilterBucket(this.f1, this.f2, a);
this.f1 = this.f2 = undefined;
this.f = null;
FilterPair.available = this;
return bucket;
}
static load(args) {
const f1 = filterFromCompiledData(args[1]);
const f2 = filterFromCompiledData(args[2]);
const pair = FilterPair.available;
if ( pair === null ) {
return new FilterPair(f1, f2);
}
FilterPair.available = null;
pair.f1 = f1;
pair.f2 = f2;
return pair;
}
};
FilterPair.prototype.f = null;
FilterPair.available = null;
registerFilterClass(FilterPair);
/******************************************************************************/
const FilterBucket = class {
constructor(a, b, c) {
this.filters = [];
if ( a !== undefined ) {
this.filters.push(a, b, c);
this._countTrieable();
}
this.trieResult = 0;
2014-09-08 23:46:58 +02:00
}
get size() {
let size = this.filters.length;
if ( this.plainTrie !== null ) {
size += this.plainTrie.size;
}
if ( this.plainHnAnchoredTrie !== null ) {
size += this.plainHnAnchoredTrie.size;
}
return size;
}
add(fdata) {
const fclass = filterClasses[fdata[0]];
if ( fclass.trieableId === 0 ) {
if ( this.plainTrie !== null ) {
return fclass.addToTrie(fdata, this.plainTrie);
}
if ( this.plainCount === 3 ) {
this.plainTrie = FilterBucket.trieContainer.createOne();
this._transferTrieable(0, this.plainTrie);
return fclass.addToTrie(fdata, this.plainTrie);
}
this.plainCount += 1;
}
if ( fclass.trieableId === 1 ) {
if ( this.plainHnAnchoredTrie !== null ) {
return fclass.addToTrie(fdata, this.plainHnAnchoredTrie);
}
if ( this.plainHnAnchoredCount === 3 ) {
this.plainHnAnchoredTrie = FilterBucket.trieContainer.createOne();
this._transferTrieable(1, this.plainHnAnchoredTrie);
return fclass.addToTrie(fdata, this.plainHnAnchoredTrie);
}
this.plainHnAnchoredCount += 1;
}
this.filters.push(filterFromCompiledData(fdata));
}
match(url, tokenBeg) {
if ( this.plainTrie !== null ) {
const pos = this.plainTrie.matches(url, tokenBeg);
if ( pos !== -1 ) {
this.trieResult = pos;
this.f = this.plainFilter;
this.f.tokenBeg = tokenBeg - (pos >>> 16);
return true;
}
}
if ( this.plainHnAnchoredTrie !== null ) {
const pos = this.plainHnAnchoredTrie.matches(url, tokenBeg);
if ( pos !== -1 && isHnAnchored(url, pos >>> 16) ) {
this.trieResult = pos;
this.f = this.plainHnAnchoredFilter;
this.f.tokenBeg = tokenBeg - (pos >>> 16);
return true;
}
}
const filters = this.filters;
for ( let i = 0, n = filters.length; i < n; i++ ) {
if ( filters[i].match(url, tokenBeg) === true ) {
this.f = filters[i];
if ( i >= 16 ) { this._promote(i); }
return true;
}
}
return false;
}
2014-09-08 23:46:58 +02:00
logData() {
if (
this.f === this.plainFilter ||
this.f === this.plainHnAnchoredFilter
) {
this.f.s = urlRegister.slice(
this.trieResult >>> 16,
this.trieResult & 0xFFFF
);
}
return this.f.logData();
}
compile(toSelfie = false) {
return [
this.fid,
this.filters.map(filter => filter.compile(toSelfie)),
this.plainTrie !== null &&
FilterBucket.trieContainer.compileOne(this.plainTrie),
this.plainHnAnchoredTrie !== null &&
FilterBucket.trieContainer.compileOne(this.plainHnAnchoredTrie),
];
}
_countTrieable() {
for ( const f of this.filters ) {
if ( f.trieableId === 0 ) {
this.plainCount += 1;
} else if ( f.trieableId === 1 ) {
this.plainHnAnchoredCount += 1;
}
}
}
_transferTrieable(trieableId, trie) {
const filters = this.filters;
let i = filters.length;
while ( i-- ) {
const f = filters[i];
if ( f.trieableId !== trieableId || f.s.length > 255 ) { continue; }
f.addToTrie(trie);
filters.splice(i, 1);
2014-09-08 23:46:58 +02:00
}
}
// Promote hit filters so they can be found faster next time.
_promote(i) {
const filters = this.filters;
let pivot = filters.length >>> 1;
while ( i < pivot ) {
pivot >>>= 1;
if ( pivot < 16 ) { break; }
}
if ( i <= pivot ) { return; }
const j = this.promoted % pivot;
//console.debug('FilterBucket.promote(): promoted %d to %d', i, j);
const f = filters[j];
filters[j] = filters[i];
filters[i] = f;
this.promoted += 1;
}
2014-09-08 23:46:58 +02:00
static reset() {
FilterBucket.trieContainer.reset();
}
2014-09-08 23:46:58 +02:00
static optimize() {
const trieDetails = this.trieContainer.optimize();
vAPI.localStorage.setItem(
'FilterBucket.trieDetails',
JSON.stringify(trieDetails)
);
}
2017-05-30 17:38:45 +02:00
static load(args) {
const bucket = new FilterBucket();
bucket.filters = args[1].map(data => filterFromCompiledData(data));
if ( Array.isArray(args[2]) ) {
bucket.plainTrie =
FilterBucket.trieContainer.createOne(args[2]);
}
if ( Array.isArray(args[3]) ) {
bucket.plainHnAnchoredTrie =
FilterBucket.trieContainer.createOne(args[3]);
}
return bucket;
}
};
FilterBucket.prototype.f = null;
FilterBucket.prototype.promoted = 0;
FilterBucket.prototype.plainCount = 0;
FilterBucket.prototype.plainTrie = null;
FilterBucket.prototype.plainFilter = new FilterPlainX('', 0);
FilterBucket.prototype.plainHnAnchoredCount = 0;
FilterBucket.prototype.plainHnAnchoredTrie = null;
FilterBucket.prototype.plainHnAnchoredFilter = new FilterPlainHnAnchoredX('', 0);
FilterBucket.trieContainer = (( ) => {
let trieDetails;
try {
trieDetails = JSON.parse(
vAPI.localStorage.getItem('FilterBucket.trieDetails')
);
} catch(ex) {
}
return new µBlock.BidiTrieContainer(trieDetails);
})();
registerFilterClass(FilterBucket);
/******************************************************************************/
/******************************************************************************/
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const FilterParser = function() {
this.cantWebsocket = vAPI.cantWebsocket;
2017-01-09 14:56:42 +01:00
this.reBadDomainOptChars = /[*+?^${}()[\]\\]/;
this.reHostnameRule1 = /^\w[\w.-]*[a-z]$/i;
this.reHostnameRule2 = /^\w[\w.-]*[a-z]\^?$/i;
2015-12-13 17:03:13 +01:00
this.reCanTrimCarets1 = /^[^*]*$/;
this.reCanTrimCarets2 = /^\^?[^^]+[^^][^^]+\^?$/;
2015-12-13 18:55:55 +01:00
this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/;
2015-02-27 00:08:42 +01:00
this.reHasUnicode = /[^\x00-\x7F]/;
this.reWebsocketAny = /^ws[s*]?(?::\/?\/?)?\*?$/;
this.reBadCSP = /(?:^|;)\s*report-(?:to|uri)\b/;
this.domainOpt = '';
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.noTokenHash = µb.urlTokenizer.noTokenHash;
2014-08-28 15:59:05 +02:00
this.reset();
2014-06-24 00:42:43 +02:00
};
/******************************************************************************/
2016-11-06 16:49:02 +01:00
// https://github.com/gorhill/uBlock/issues/1493
// Transpose `ping` into `other` for now.
2014-06-24 00:42:43 +02:00
FilterParser.prototype.toNormalizedType = {
'all': 'all',
'beacon': 'other',
'css': 'stylesheet',
'data': 'data',
'doc': 'main_frame',
'document': 'main_frame',
'elemhide': 'generichide',
2015-04-05 16:38:47 +02:00
'font': 'font',
'frame': 'sub_frame',
2017-09-14 23:54:59 +02:00
'genericblock': 'unsupported',
'generichide': 'generichide',
'image': 'image',
2017-09-16 13:49:43 +02:00
'inline-font': 'inline-font',
'inline-script': 'inline-script',
2016-03-07 01:16:46 +01:00
'media': 'media',
'object': 'object',
'object-subrequest': 'object',
2017-11-03 21:51:28 +01:00
'other': 'other',
2016-11-06 16:49:02 +01:00
'ping': 'other',
'popunder': 'popunder',
'popup': 'popup',
'script': 'script',
'stylesheet': 'stylesheet',
'subdocument': 'sub_frame',
'xhr': 'xmlhttprequest',
'xmlhttprequest': 'xmlhttprequest',
2017-09-14 05:41:20 +02:00
'webrtc': 'unsupported',
'websocket': 'websocket',
2014-06-24 00:42:43 +02:00
};
/******************************************************************************/
FilterParser.prototype.reset = function() {
this.action = BlockAction;
this.anchor = 0;
this.badFilter = false;
this.dataType = undefined;
this.dataStr = undefined;
2014-06-24 00:42:43 +02:00
this.elemHiding = false;
this.f = '';
this.firstParty = false;
this.thirdParty = false;
this.party = AnyParty;
2014-06-24 00:42:43 +02:00
this.fopts = '';
this.domainOpt = '';
this.isPureHostname = false;
2015-01-23 17:32:49 +01:00
this.isRegex = false;
this.raw = '';
this.redirect = 0;
2015-12-04 03:24:37 +01:00
this.token = '*';
2017-05-19 14:45:19 +02:00
this.tokenHash = this.noTokenHash;
2015-01-23 17:32:49 +01:00
this.tokenBeg = 0;
this.types = 0;
this.notTypes = 0;
2014-08-29 21:02:31 +02:00
this.important = 0;
this.wildcarded = false;
2014-06-24 00:42:43 +02:00
this.unsupported = false;
return this;
};
/******************************************************************************/
FilterParser.prototype.bitFromType = function(type) {
return 1 << ((typeNameToTypeValue[type] >>> 4) - 1);
};
/******************************************************************************/
2015-04-07 03:26:05 +02:00
// https://github.com/chrisaljoudi/uBlock/issues/589
// Be ready to handle multiple negated types
FilterParser.prototype.parseTypeOption = function(raw, not) {
const typeBit = raw !== 'all'
? this.bitFromType(this.toNormalizedType[raw])
: allTypesBits;
if ( not ) {
this.notTypes |= typeBit;
} else {
2015-03-26 00:28:22 +01:00
this.types |= typeBit;
}
2014-06-24 00:42:43 +02:00
};
/******************************************************************************/
FilterParser.prototype.parsePartyOption = function(firstParty, not) {
if ( firstParty ) {
not = !not;
}
2014-06-24 00:42:43 +02:00
if ( not ) {
this.firstParty = true;
this.party = this.thirdParty ? AnyParty : FirstParty;
2014-06-24 00:42:43 +02:00
} else {
this.thirdParty = true;
this.party = this.firstParty ? AnyParty : ThirdParty;
2014-06-24 00:42:43 +02:00
}
};
/******************************************************************************/
FilterParser.prototype.parseDomainOption = function(s) {
if ( this.reHasUnicode.test(s) ) {
const hostnames = s.split('|');
let i = hostnames.length;
while ( i-- ) {
if ( this.reHasUnicode.test(hostnames[i]) ) {
hostnames[i] = punycode.toASCII(hostnames[i]);
}
}
s = hostnames.join('|');
}
if ( this.reBadDomainOptChars.test(s) ) { return ''; }
return s;
};
/******************************************************************************/
2015-01-23 17:32:49 +01:00
FilterParser.prototype.parseOptions = function(s) {
this.fopts = s;
for ( let opt of s.split(/\s*,\s*/) ) {
const not = opt.startsWith('~');
2014-06-24 00:42:43 +02:00
if ( not ) {
2015-01-23 17:32:49 +01:00
opt = opt.slice(1);
2014-06-24 00:42:43 +02:00
}
if ( opt === 'third-party' || opt === '3p' ) {
this.parsePartyOption(false, not);
2015-01-23 17:32:49 +01:00
continue;
}
if ( opt === 'first-party' || opt === '1p' ) {
this.parsePartyOption(true, not);
continue;
}
2015-01-23 17:32:49 +01:00
if ( this.toNormalizedType.hasOwnProperty(opt) ) {
this.parseTypeOption(opt, not);
2015-01-23 17:32:49 +01:00
continue;
}
2017-01-09 14:56:42 +01:00
// https://github.com/gorhill/uBlock/issues/2294
// Detect and discard filter if domain option contains nonsensical
// characters.
if ( opt.startsWith('domain=') ) {
this.domainOpt = this.parseDomainOption(opt.slice(7));
if ( this.domainOpt === '' ) {
2017-01-09 14:56:42 +01:00
this.unsupported = true;
break;
}
2015-01-23 17:32:49 +01:00
continue;
}
if ( opt === 'important' ) {
this.important = Important;
continue;
2014-06-24 00:42:43 +02:00
}
if ( /^redirect(?:-rule)?=/.test(opt) ) {
if ( this.redirect !== 0 ) {
this.unsupported = true;
break;
}
this.redirect = opt.charCodeAt(8) === 0x3D /* '=' */ ? 1 : 2;
continue;
}
if (
opt.startsWith('csp=') &&
opt.length > 4 &&
this.reBadCSP.test(opt) === false
) {
this.parseTypeOption('data', not);
this.dataType = 'csp';
this.dataStr = opt.slice(4).trim();
continue;
}
if ( opt === 'csp' && this.action === AllowAction ) {
this.parseTypeOption('data', not);
this.dataType = 'csp';
this.dataStr = '';
continue;
}
// Used by Adguard, purpose is unclear -- just ignore for now.
if ( opt === 'empty' ) {
if ( this.redirect !== 0 ) {
this.unsupported = true;
break;
}
this.redirect = 1;
continue;
}
// https://github.com/uBlockOrigin/uAssets/issues/192
if ( opt === 'badfilter' ) {
this.badFilter = true;
continue;
}
2015-11-24 05:34:03 +01:00
// Unrecognized filter option: ignore whole filter.
2015-01-23 17:32:49 +01:00
this.unsupported = true;
break;
2014-06-24 00:42:43 +02:00
}
// Redirect rules can't be exception filters.
if ( this.redirect !== 0 && this.action !== BlockAction ) {
this.unsupported = true;
}
// Negated network types? Toggle on all network type bits.
// Negated non-network types can only toggle themselves.
if ( (this.notTypes & allNetworkTypesBits) !== 0 ) {
this.types |= allNetworkTypesBits;
}
if ( this.notTypes !== 0 ) {
this.types &= ~this.notTypes;
if ( this.types === 0 ) {
this.unsupported = true;
}
}
// https://github.com/gorhill/uBlock/issues/2283
// Abort if type is only for unsupported types, otherwise
// toggle off `unsupported` bit.
if ( this.types & unsupportedTypeBit ) {
this.types &= ~unsupportedTypeBit;
if ( this.types === 0 ) {
this.unsupported = true;
}
}
2014-06-24 00:42:43 +02:00
};
/*******************************************************************************
anchor: bit vector
0000 (0x0): no anchoring
0001 (0x1): anchored to the end of the URL.
0010 (0x2): anchored to the start of the URL.
0011 (0x3): anchored to the start and end of the URL.
0100 (0x4): anchored to the hostname of the URL.
0101 (0x5): anchored to the hostname and end of the URL.
**/
2015-02-27 00:08:42 +01:00
FilterParser.prototype.parse = function(raw) {
2014-06-24 00:42:43 +02:00
// important!
this.reset();
let s = this.raw = raw;
2015-02-27 00:08:42 +01:00
// Filters which are a single alphanumeric character are discarded
// as unsupported.
if ( s.length === 1 && /[0-9a-z]/i.test(s) ) {
this.unsupported = true;
return this;
}
2015-12-13 18:55:55 +01:00
// plain hostname? (from HOSTS file)
if ( this.reHostnameRule1.test(s) ) {
this.f = s.toLowerCase();
this.isPureHostname = true;
this.anchor |= 0x4;
2014-09-19 16:59:44 +02:00
return this;
}
2014-06-24 00:42:43 +02:00
// element hiding filter?
let pos = s.indexOf('#');
2015-01-23 17:32:49 +01:00
if ( pos !== -1 ) {
const c = s.charAt(pos + 1);
2015-01-23 17:32:49 +01:00
if ( c === '#' || c === '@' ) {
console.error('static-net-filtering.js > unexpected cosmetic filters');
this.elemHiding = true;
return this;
}
}
// block or allow filter?
// Important: this must be executed before parsing options
if ( s.startsWith('@@') ) {
this.action = AllowAction;
s = s.slice(2);
}
2015-01-23 17:32:49 +01:00
// options
2015-11-06 16:49:09 +01:00
// https://github.com/gorhill/uBlock/issues/842
// - ensure sure we are not dealing with a regex-based filter.
// - lookup the last occurrence of `$`.
if ( s.startsWith('/') === false || s.endsWith('/') === false ) {
2015-11-06 16:49:09 +01:00
pos = s.lastIndexOf('$');
if ( pos !== -1 ) {
// https://github.com/gorhill/uBlock/issues/952
2017-09-14 05:41:20 +02:00
// Discard Adguard-specific `$$` filters.
if ( s.indexOf('$$') !== -1 ) {
this.unsupported = true;
return this;
}
2015-11-06 16:49:09 +01:00
this.parseOptions(s.slice(pos + 1));
if ( this.unsupported ) { return this; }
2015-11-06 16:49:09 +01:00
s = s.slice(0, pos);
}
2014-06-24 00:42:43 +02:00
}
2015-01-23 17:32:49 +01:00
// regex?
if ( s.startsWith('/') && s.endsWith('/') && s.length > 2 ) {
2015-01-23 17:32:49 +01:00
this.isRegex = true;
this.f = s.slice(1, -1);
2016-01-17 02:21:17 +01:00
// https://github.com/gorhill/uBlock/issues/1246
// If the filter is valid, use the corrected version of the source
// string -- this ensure reverse-lookup will work fine.
this.f = normalizeRegexSource(this.f);
if ( this.f === '' ) {
2015-10-26 16:23:56 +01:00
console.error(
"uBlock Origin> discarding bad regular expression-based network filter '%s': '%s'",
raw,
2016-01-17 02:21:17 +01:00
normalizeRegexSource.message
2015-10-26 16:23:56 +01:00
);
this.unsupported = true;
}
2014-09-08 23:46:58 +02:00
return this;
}
2015-02-27 00:08:42 +01:00
// hostname-anchored
if ( s.startsWith('||') ) {
this.anchor |= 0x4;
2015-12-13 18:55:55 +01:00
s = s.slice(2);
2015-02-27 00:08:42 +01:00
// convert hostname to punycode if needed
2017-05-09 14:58:30 +02:00
// https://github.com/gorhill/uBlock/issues/2599
2015-02-27 00:08:42 +01:00
if ( this.reHasUnicode.test(s) ) {
const matches = this.reIsolateHostname.exec(s);
2015-12-13 18:55:55 +01:00
if ( matches ) {
2017-05-09 14:58:30 +02:00
s = (matches[1] !== undefined ? matches[1] : '') +
punycode.toASCII(matches[2]) +
matches[3];
2015-02-27 00:08:42 +01:00
//console.debug('µBlock.staticNetFilteringEngine/FilterParser.parse():', raw, '=', s);
}
}
2015-03-26 20:16:48 +01:00
2015-04-07 03:26:05 +02:00
// https://github.com/chrisaljoudi/uBlock/issues/1096
if ( s.startsWith('^') ) {
2015-03-26 20:16:48 +01:00
this.unsupported = true;
return this;
}
2015-12-13 18:55:55 +01:00
// plain hostname? (from ABP filter list)
2016-06-27 03:15:18 +02:00
// https://github.com/gorhill/uBlock/issues/1757
// A filter can't be a pure-hostname one if there is a domain or csp
// option present.
if ( this.reHostnameRule2.test(s) ) {
if ( s.charCodeAt(s.length - 1) === 0x5E /* '^' */ ) {
s = s.slice(0, -1);
}
this.f = s.toLowerCase();
this.isPureHostname = true;
2015-12-13 18:55:55 +01:00
return this;
}
2014-06-24 00:42:43 +02:00
}
// left-anchored
else if ( s.startsWith('|') ) {
this.anchor |= 0x2;
2014-06-24 00:42:43 +02:00
s = s.slice(1);
}
// right-anchored
if ( s.endsWith('|') ) {
this.anchor |= 0x1;
2014-06-24 00:42:43 +02:00
s = s.slice(0, -1);
}
2017-05-30 17:38:45 +02:00
// https://github.com/gorhill/uBlock/issues/1669#issuecomment-224822448
// remove pointless leading *.
2017-09-18 19:06:36 +02:00
// https://github.com/gorhill/uBlock/issues/3034
// - We can remove anchoring if we need to match all at the start.
2017-05-30 17:38:45 +02:00
if ( s.startsWith('*') ) {
2018-03-07 16:37:18 +01:00
s = s.replace(/^\*+([^%0-9a-z])/i, '$1');
2017-09-18 19:06:36 +02:00
this.anchor &= ~0x6;
2017-05-30 17:38:45 +02:00
}
// remove pointless trailing *
2017-09-18 19:06:36 +02:00
// https://github.com/gorhill/uBlock/issues/3034
// - We can remove anchoring if we need to match all at the end.
2017-05-30 17:38:45 +02:00
if ( s.endsWith('*') ) {
2018-03-07 16:37:18 +01:00
s = s.replace(/([^%0-9a-z])\*+$/i, '$1');
2017-09-18 19:06:36 +02:00
this.anchor &= ~0x1;
2015-01-23 17:32:49 +01:00
}
2014-09-19 16:59:44 +02:00
// nothing left?
if ( s === '' ) {
2015-03-17 14:39:03 +01:00
s = '*';
}
// TODO: remove once redirect rules with `*/*` pattern are no longer used.
else if ( this.redirect !== 0 && s === '/' ) {
s = '*';
}
2015-12-11 12:36:28 +01:00
// https://github.com/gorhill/uBlock/issues/1047
// Hostname-anchored makes no sense if matching all requests.
if ( s === '*' ) {
this.anchor = 0;
2015-12-11 12:36:28 +01:00
}
this.wildcarded = reIsWildcarded.test(s);
this.f = s.toLowerCase();
2015-01-23 17:32:49 +01:00
return this;
};
/******************************************************************************/
2015-03-02 16:41:51 +01:00
// Given a string, find a good token. Tokens which are too generic, i.e. very
// common with a high probability of ending up as a miss, are not
// good. Avoid if possible. This has a *significant* positive impact on
// performance.
// These "bad tokens" are collated manually.
2015-12-04 03:24:37 +01:00
// Hostname-anchored with no wildcard always have a token index of 0.
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
const reGoodToken = /[%0-9a-z]{2,}/g;
const reRegexToken = /[%0-9A-Za-z]{2,}/g;
const reRegexTokenAbort = /[([]/;
const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
const badTokens = new Set([
'com',
'google',
'http',
'https',
'icon',
'images',
'img',
'js',
'net',
'news',
'www'
]);
2015-03-02 16:41:51 +01:00
2017-07-11 19:57:31 +02:00
FilterParser.prototype.findFirstGoodToken = function() {
2015-03-02 16:41:51 +01:00
reGoodToken.lastIndex = 0;
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
const s = this.f;
let matches;
let badTokenMatch = null;
while ( (matches = reGoodToken.exec(s)) !== null ) {
2015-12-03 16:06:06 +01:00
// https://github.com/gorhill/uBlock/issues/997
// Ignore token if preceded by wildcard.
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
const lpos = matches.index;
if ( lpos !== 0 && s.charCodeAt(lpos - 1) === 0x2A /* '*' */ ) {
2015-12-03 16:06:06 +01:00
continue;
}
if ( s.charCodeAt(reGoodToken.lastIndex) === 0x2A /* '*' */ ) {
2015-03-02 22:22:23 +01:00
continue;
}
if ( badTokens.has(matches[0]) ) {
2015-12-04 03:24:37 +01:00
if ( badTokenMatch === null ) {
badTokenMatch = matches;
}
2015-03-02 16:41:51 +01:00
continue;
}
2015-03-02 22:22:23 +01:00
return matches;
}
2015-12-04 03:24:37 +01:00
return badTokenMatch;
2015-03-02 16:41:51 +01:00
};
2017-07-11 19:57:31 +02:00
FilterParser.prototype.extractTokenFromRegex = function() {
reRegexToken.lastIndex = 0;
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
const s = this.f;
let matches;
while ( (matches = reRegexToken.exec(s)) !== null ) {
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
const prefix = s.slice(0, matches.index);
if ( reRegexTokenAbort.test(prefix) ) { return; }
if (
reRegexBadPrefix.test(prefix) ||
reRegexBadSuffix.test(s.slice(reRegexToken.lastIndex))
) {
continue;
}
this.token = matches[0].toLowerCase();
this.tokenHash = µb.urlTokenizer.tokenHashFromString(this.token);
this.tokenBeg = matches.index;
if ( badTokens.has(this.token) === false ) { break; }
}
2017-07-11 19:57:31 +02:00
};
2015-03-02 22:22:23 +01:00
/******************************************************************************/
2017-07-11 18:21:08 +02:00
// https://github.com/chrisaljoudi/uBlock/issues/1038
// Single asterisk will match any URL.
// https://github.com/gorhill/uBlock/issues/2781
// For efficiency purpose, try to extract a token from a regex-based filter.
2015-01-23 17:32:49 +01:00
FilterParser.prototype.makeToken = function() {
2017-07-11 18:21:08 +02:00
if ( this.isRegex ) {
2017-07-11 19:57:31 +02:00
this.extractTokenFromRegex();
2017-07-11 18:21:08 +02:00
return;
}
if ( this.f === '*' ) { return; }
2015-01-23 17:32:49 +01:00
let matches = this.findFirstGoodToken();
if ( matches !== null ) {
2015-12-04 03:24:37 +01:00
this.token = matches[0];
2017-05-19 14:45:19 +02:00
this.tokenHash = µb.urlTokenizer.tokenHashFromString(this.token);
2015-01-23 17:32:49 +01:00
this.tokenBeg = matches.index;
}
2014-06-24 00:42:43 +02:00
};
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
/******************************************************************************/
FilterParser.prototype.isJustOrigin = function() {
return this.dataType === undefined &&
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.domainOpt !== '' &&
/^(?:\*|http[s*]?:(?:\/\/)?)$/.test(this.f) &&
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.domainOpt.indexOf('~') === -1;
};
2014-06-24 00:42:43 +02:00
/******************************************************************************/
/******************************************************************************/
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
const FilterContainer = function() {
2014-06-24 00:42:43 +02:00
this.filterParser = new FilterParser();
this.urlTokenizer = µb.urlTokenizer;
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.noTokenHash = this.urlTokenizer.noTokenHash;
this.dotTokenHash = this.urlTokenizer.dotTokenHash;
this.anyTokenHash = this.urlTokenizer.anyTokenHash;
this.anyHTTPSTokenHash = this.urlTokenizer.anyHTTPSTokenHash;
this.anyHTTPTokenHash = this.urlTokenizer.anyHTTPTokenHash;
2014-07-20 21:00:26 +02:00
this.reset();
};
/******************************************************************************/
// Reset all, thus reducing to a minimum memory footprint of the context.
FilterContainer.prototype.reset = function() {
this.frozen = false;
2014-06-24 00:42:43 +02:00
this.processedFilterCount = 0;
this.acceptedCount = 0;
2014-09-08 23:46:58 +02:00
this.rejectedCount = 0;
2014-06-24 00:42:43 +02:00
this.allowFilterCount = 0;
this.blockFilterCount = 0;
2016-03-17 18:56:21 +01:00
this.discardedCount = 0;
this.goodFilters = new Set();
this.badFilters = new Set();
this.categories = new Map();
this.dataFilters = new Map();
2014-07-20 21:00:26 +02:00
this.filterParser.reset();
this.urlTokenizer.resetKnownTokens();
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
// This will invalidate all tries
FilterHostnameDict.reset();
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
filterOrigin.reset();
FilterBucket.reset();
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
// Runtime registers
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.catbitsRegister = 0;
this.tokenRegister = 0;
this.filterRegister = null;
2014-07-20 21:00:26 +02:00
};
2014-06-24 00:42:43 +02:00
2014-07-20 21:00:26 +02:00
/******************************************************************************/
2014-06-24 00:42:43 +02:00
2014-07-20 21:00:26 +02:00
FilterContainer.prototype.freeze = function() {
const filterPairId = FilterPair.fid;
const filterBucketId = FilterBucket.fid;
const filterDataHolderId = FilterDataHolder.fid;
const redirectTypeValue = typeNameToTypeValue.redirect;
const unserialize = µb.CompiledLineIO.unserialize;
for ( const line of this.goodFilters ) {
if ( this.badFilters.has(line) ) {
this.discardedCount += 1;
continue;
}
const args = unserialize(line);
const bits = args[0];
// Special cases: delegate to more specialized engines.
// Redirect engine.
if ( (bits & 0x1F0) === redirectTypeValue ) {
µb.redirectEngine.fromCompiledRule(args[1]);
continue;
}
// Plain static filters.
const tokenHash = args[1];
const fdata = args[2];
// Special treatment: data-holding filters are stored separately
// because they require special matching algorithm (unlike other
// filters, ALL hits must be reported).
if ( fdata[0] === filterDataHolderId ) {
let entry = new FilterDataHolderEntry(bits, tokenHash, fdata);
let bucket = this.dataFilters.get(tokenHash);
if ( bucket !== undefined ) {
entry.next = bucket;
}
this.dataFilters.set(tokenHash, entry);
this.urlTokenizer.addKnownToken(tokenHash);
continue;
}
let bucket = this.categories.get(bits);
if ( bucket === undefined ) {
bucket = new Map();
this.categories.set(bits, bucket);
}
let entry = bucket.get(tokenHash);
if ( tokenHash === this.dotTokenHash ) {
if ( entry === undefined ) {
entry = new FilterHostnameDict();
bucket.set(this.dotTokenHash, entry);
}
entry.add(fdata);
continue;
}
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
if ( tokenHash === this.anyTokenHash ) {
if ( entry === undefined ) {
entry = new FilterJustOrigin();
bucket.set(this.anyTokenHash, entry);
}
entry.add(fdata);
continue;
}
if ( tokenHash === this.anyHTTPSTokenHash ) {
if ( entry === undefined ) {
entry = new FilterHTTPSJustOrigin();
bucket.set(this.anyHTTPSTokenHash, entry);
}
entry.add(fdata);
continue;
}
if ( tokenHash === this.anyHTTPTokenHash ) {
if ( entry === undefined ) {
entry = new FilterHTTPJustOrigin();
bucket.set(this.anyHTTPTokenHash, entry);
}
entry.add(fdata);
continue;
}
this.urlTokenizer.addKnownToken(tokenHash);
if ( entry === undefined ) {
bucket.set(tokenHash, filterFromCompiledData(fdata));
continue;
}
if ( entry.fid === filterBucketId ) {
entry.add(fdata);
continue;
}
if ( entry.fid === filterPairId ) {
bucket.set(
tokenHash,
entry.upgrade(filterFromCompiledData(fdata))
);
continue;
}
bucket.set(
tokenHash,
new FilterPair(entry, filterFromCompiledData(fdata))
);
}
2014-07-20 21:00:26 +02:00
this.filterParser.reset();
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.badFilters.clear();
this.goodFilters.clear();
FilterHostnameDict.optimize();
FilterBucket.optimize();
2014-07-20 21:00:26 +02:00
this.frozen = true;
};
/******************************************************************************/
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
// This is necessary for when the filtering engine readiness will depend
// on asynchronous operations (ex.: when loading a wasm module).
FilterContainer.prototype.readyToUse = function() {
return Promise.resolve();
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
};
/******************************************************************************/
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
FilterContainer.prototype.toSelfie = function(path) {
const categoriesToSelfie = function(categoryMap) {
const selfie = [];
for ( const [ catbits, bucket ] of categoryMap ) {
const tokenEntries = [];
for ( const [ token, filter ] of bucket ) {
tokenEntries.push([ token, filter.compile(true) ]);
}
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
selfie.push([ catbits, tokenEntries ]);
2014-09-08 23:46:58 +02:00
}
return selfie;
2014-09-08 23:46:58 +02:00
};
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
const dataFiltersToSelfie = function(dataFilters) {
const selfie = [];
for ( let entry of dataFilters.values() ) {
do {
selfie.push(entry.compile(true));
entry = entry.next;
} while ( entry !== undefined );
}
return selfie;
};
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
filterOrigin.optimize();
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
return Promise.all([
µBlock.assets.put(
`${path}/FilterHostnameDict.trieContainer`,
FilterHostnameDict.trieContainer.serialize(µBlock.base64)
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
),
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
µBlock.assets.put(
`${path}/FilterOrigin.trieContainer`,
filterOrigin.trieContainer.serialize(µBlock.base64)
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
),
µBlock.assets.put(
`${path}/FilterBucket.trieContainer`,
FilterBucket.trieContainer.serialize(µBlock.base64)
),
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
µBlock.assets.put(
`${path}/main`,
JSON.stringify({
processedFilterCount: this.processedFilterCount,
acceptedCount: this.acceptedCount,
rejectedCount: this.rejectedCount,
allowFilterCount: this.allowFilterCount,
blockFilterCount: this.blockFilterCount,
discardedCount: this.discardedCount,
categories: categoriesToSelfie(this.categories),
dataFilters: dataFiltersToSelfie(this.dataFilters),
urlTokenizer: this.urlTokenizer.toSelfie(),
filterOriginStrSlots: filterOrigin.strSlots,
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
})
)
]);
2014-09-08 23:46:58 +02:00
};
/******************************************************************************/
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
FilterContainer.prototype.fromSelfie = function(path) {
return Promise.all([
µBlock.assets.get(`${path}/FilterHostnameDict.trieContainer`).then(details =>
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
FilterHostnameDict.trieContainer.unserialize(
details.content,
µBlock.base64
)
),
µBlock.assets.get(`${path}/FilterOrigin.trieContainer`).then(details =>
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
filterOrigin.trieContainer.unserialize(
details.content,
µBlock.base64
)
),
µBlock.assets.get(`${path}/FilterBucket.trieContainer`).then(details =>
FilterBucket.trieContainer.unserialize(
details.content,
µBlock.base64
)
),
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
µBlock.assets.get(`${path}/main`).then(details => {
let selfie;
try {
selfie = JSON.parse(details.content);
} catch (ex) {
}
if ( selfie instanceof Object === false ) { return false; }
this.frozen = true;
this.processedFilterCount = selfie.processedFilterCount;
this.acceptedCount = selfie.acceptedCount;
this.rejectedCount = selfie.rejectedCount;
this.allowFilterCount = selfie.allowFilterCount;
this.blockFilterCount = selfie.blockFilterCount;
this.discardedCount = selfie.discardedCount;
this.urlTokenizer.fromSelfie(selfie.urlTokenizer);
filterOrigin.strSlots = selfie.filterOriginStrSlots;
Refactor selfie generation into a more flexible persistence mechanism The motivation is to address the higher peak memory usage at launch time with 3rd-gen HNTrie when a selfie was present. The selfie generation prior to this change was to collect all filtering data into a single data structure, and then to serialize that whole structure at once into storage (using JSON.stringify). However, HNTrie serialization requires that a large UintArray32 be converted into a plain JS array, which itslef would be indirectly converted into a JSON string. This was the main reason why peak memory usage would be higher at launch from selfie, since the JSON string would need to be wholly unserialized into JS objects, which themselves would need to be converted into more specialized data structures (like that Uint32Array one). The solution to lower peak memory usage at launch is to refactor selfie generation to allow a more piecemeal approach: each filtering component is given the ability to serialize itself rather than to be forced to be embedded in the master selfie. With this approach, the HNTrie buffer can now serialize to its own storage by converting the buffer data directly into a string which can be directly sent to storage. This avoiding expensive intermediate steps such as converting into a JS array and then to a JSON string. As part of the refactoring, there was also opportunistic code upgrade to ES6 and Promise (eventually all of uBO's code will be proper ES6). Additionally, the polyfill to bring getBytesInUse() to Firefox has been revisited to replace the rather expensive previous implementation with an implementation with virtually no overhead.
2019-02-14 19:33:55 +01:00
for ( const [ catbits, bucket ] of selfie.categories ) {
const tokenMap = new Map();
for ( const [ token, fdata ] of bucket ) {
tokenMap.set(token, filterFromCompiledData(fdata));
}
this.categories.set(catbits, tokenMap);
}
for ( const dataEntry of selfie.dataFilters ) {
const entry = FilterDataHolderEntry.load(dataEntry);
const bucket = this.dataFilters.get(entry.tokenHash);
if ( bucket !== undefined ) {
entry.next = bucket;
}
this.dataFilters.set(entry.tokenHash, entry);
}
return true;
}),
]).then(results =>
results.reduce((acc, v) => acc && v, true)
);
};
/******************************************************************************/
FilterContainer.prototype.compile = function(raw, writer) {
2014-06-24 00:42:43 +02:00
// ORDER OF TESTS IS IMPORTANT!
// Ignore empty lines
const s = raw.trim();
if ( s.length === 0 ) { return false; }
2014-06-24 00:42:43 +02:00
const parsed = this.filterParser.parse(s);
2014-06-24 00:42:43 +02:00
2015-01-23 17:32:49 +01:00
// Ignore element-hiding filters
if ( parsed.elemHiding ) {
2014-09-08 23:46:58 +02:00
return false;
}
2015-01-23 17:32:49 +01:00
// Ignore filters with unsupported options
if ( parsed.unsupported ) {
const who = writer.properties.get('assetKey') || '?';
µb.logger.writeOne({
realm: 'message',
type: 'error',
text: `Invalid network filter in ${who}: ${raw}`
});
2014-06-24 00:42:43 +02:00
return false;
}
// Redirect rule
if ( parsed.redirect !== 0 ) {
const result = this.compileRedirectRule(parsed, writer);
if ( result === false ) {
const who = writer.properties.get('assetKey') || '?';
µb.logger.writeOne({
realm: 'message',
type: 'error',
text: `Invalid redirect rule in ${who}: ${raw}`
});
return false;
}
if ( parsed.redirect === 2 ) {
return true;
}
}
// Pure hostnames, use more efficient dictionary lookup
2015-04-07 03:26:05 +02:00
// https://github.com/chrisaljoudi/uBlock/issues/665
2015-02-05 00:06:31 +01:00
// Create a dict keyed on request type etc.
if (
parsed.isPureHostname &&
parsed.domainOpt === '' &&
parsed.dataType === undefined
) {
parsed.tokenHash = this.dotTokenHash;
this.compileToAtomicFilter(parsed, parsed.f, writer);
2015-02-05 00:06:31 +01:00
return true;
2014-06-24 00:42:43 +02:00
}
parsed.makeToken();
let fdata;
if ( parsed.isRegex ) {
fdata = FilterRegex.compile(parsed);
} else if ( parsed.isPureHostname ) {
fdata = FilterPlainHostname.compile(parsed);
} else if ( parsed.f === '*' ) {
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
if ( parsed.isJustOrigin() ) {
parsed.tokenHash = this.anyTokenHash;
for ( const hn of parsed.domainOpt.split('|') ) {
this.compileToAtomicFilter(parsed, hn, writer);
}
return true;
}
fdata = FilterTrue.compile();
} else if ( parsed.anchor === 0x5 ) {
fdata = FilterGenericHnAndRightAnchored.compile(parsed);
2017-05-30 17:38:45 +02:00
} else if ( parsed.anchor === 0x4 ) {
if (
parsed.wildcarded === false &&
parsed.tokenHash !== parsed.noTokenHash
2017-05-30 17:38:45 +02:00
) {
fdata = FilterPlainHnAnchored.compile(parsed);
} else {
fdata = FilterGenericHnAnchored.compile(parsed);
}
} else if ( parsed.anchor === 0x2 && parsed.isJustOrigin() ) {
const hostnames = parsed.domainOpt.split('|');
const isHTTPS = parsed.f === 'https://' || parsed.f === 'http*://';
const isHTTP = parsed.f === 'http://' || parsed.f === 'http*://';
for ( const hn of hostnames ) {
if ( isHTTPS ) {
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
parsed.tokenHash = this.anyHTTPSTokenHash;
this.compileToAtomicFilter(parsed, hn, writer);
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
if ( isHTTP ) {
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
parsed.tokenHash = this.anyHTTPTokenHash;
this.compileToAtomicFilter(parsed, hn, writer);
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
}
return true;
} else if ( parsed.wildcarded || parsed.tokenHash === parsed.noTokenHash ) {
fdata = FilterGeneric.compile(parsed);
} else if ( parsed.anchor === 0x2 ) {
fdata = FilterPlainLeftAnchored.compile(parsed);
} else if ( parsed.anchor === 0x1 ) {
fdata = FilterPlainRightAnchored.compile(parsed);
2017-10-09 15:28:28 +02:00
} else if ( parsed.anchor === 0x3 ) {
fdata = FilterExactMatch.compile(parsed);
} else {
fdata = FilterPlain.compile(parsed);
2014-06-24 00:42:43 +02:00
}
if ( parsed.domainOpt !== '' ) {
fdata = filterOrigin.compile(parsed, fdata);
}
if ( parsed.dataType !== undefined ) {
let fwrapped = fdata;
fdata = FilterDataHolder.compile(parsed);
fdata.push(fwrapped);
}
this.compileToAtomicFilter(parsed, fdata, writer);
2014-06-24 00:42:43 +02:00
return true;
};
/******************************************************************************/
FilterContainer.prototype.compileToAtomicFilter = function(
parsed,
fdata,
writer
) {
// 0 = network filters
// 1 = network filters: bad filters
writer.select(parsed.badFilter ? 1 : 0);
const descBits = parsed.action | parsed.important | parsed.party;
let typeBits = parsed.types;
// Typeless
if ( typeBits === 0 ) {
writer.push([ descBits, parsed.tokenHash, fdata ]);
2015-03-26 00:28:22 +01:00
return;
}
// If all network types are set, create a typeless filter
if ( (typeBits & allNetworkTypesBits) === allNetworkTypesBits ) {
writer.push([ descBits, parsed.tokenHash, fdata ]);
typeBits &= ~allNetworkTypesBits;
}
// One filter per specific types
let bitOffset = 1;
2015-03-26 00:28:22 +01:00
do {
if ( typeBits & 1 ) {
writer.push([ descBits | (bitOffset << 4), parsed.tokenHash, fdata ]);
}
bitOffset += 1;
typeBits >>>= 1;
} while ( typeBits !== 0 );
};
/******************************************************************************/
FilterContainer.prototype.compileRedirectRule = function(parsed, writer) {
const redirects = µb.redirectEngine.compileRuleFromStaticFilter(parsed.raw);
if ( Array.isArray(redirects) === false ) { return false; }
writer.select(parsed.badFilter ? 1 : 0);
const type = typeNameToTypeValue.redirect;
for ( const redirect of redirects ) {
writer.push([ type, redirect ]);
}
return true;
2014-06-24 00:42:43 +02:00
};
/******************************************************************************/
FilterContainer.prototype.fromCompiledContent = function(reader) {
// 0 = network filters
reader.select(0);
while ( reader.next() ) {
this.acceptedCount += 1;
if ( this.goodFilters.has(reader.line) ) {
this.discardedCount += 1;
} else {
this.goodFilters.add(reader.line);
}
}
2015-02-24 00:31:29 +01:00
// 1 = network filters: bad filter directives
reader.select(1);
while ( reader.next() ) {
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.badFilters.add(reader.line);
}
};
/******************************************************************************/
FilterContainer.prototype.matchAndFetchData = function(
dataType,
requestURL,
out,
outlog
) {
if ( this.dataFilters.size === 0 ) { return; }
const url = this.urlTokenizer.setURL(requestURL);
pageHostnameRegister = requestHostnameRegister =
µb.URI.hostnameFromURI(url);
// We need to visit ALL the matching filters.
const toAddImportant = new Map();
const toAdd = new Map();
const toRemove = new Map();
const tokenHashes = this.urlTokenizer.getTokens();
let i = 0;
2017-05-19 14:45:19 +02:00
while ( i < 32 ) {
const tokenHash = tokenHashes[i++];
2017-05-19 14:45:19 +02:00
if ( tokenHash === 0 ) { break; }
const tokenOffset = tokenHashes[i++];
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
let entry = this.dataFilters.get(tokenHash);
while ( entry !== undefined ) {
const f = entry.filter;
2017-05-19 14:45:19 +02:00
if ( f.match(url, tokenOffset) === true ) {
if ( entry.categoryBits & 0x001 ) {
toRemove.set(f.dataStr, entry);
2017-05-19 14:45:19 +02:00
} else if ( entry.categoryBits & 0x002 ) {
toAddImportant.set(f.dataStr, entry);
} else {
toAdd.set(f.dataStr, entry);
}
}
entry = entry.next;
}
}
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
let entry = this.dataFilters.get(this.noTokenHash);
while ( entry !== undefined ) {
const f = entry.filter;
2017-05-19 14:45:19 +02:00
if ( f.match(url) === true ) {
if ( entry.categoryBits & 0x001 ) {
toRemove.set(f.dataStr, entry);
2017-05-19 14:45:19 +02:00
} else if ( entry.categoryBits & 0x002 ) {
toAddImportant.set(f.dataStr, entry);
} else {
toAdd.set(f.dataStr, entry);
}
}
entry = entry.next;
}
if ( toAddImportant.size === 0 && toAdd.size === 0 ) { return; }
// Remove entries overriden by other filters.
for ( const key of toAddImportant.keys() ) {
toAdd.delete(key);
toRemove.delete(key);
}
for ( const key of toRemove.keys() ) {
if ( key === '' ) {
toAdd.clear();
break;
}
toAdd.delete(key);
}
for ( const entry of toAddImportant ) {
out.push(entry[0]);
if ( outlog === undefined ) { continue; }
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
let logData = entry[1].logData();
logData.source = 'static';
logData.result = 1;
outlog.push(logData);
}
for ( const entry of toAdd ) {
out.push(entry[0]);
if ( outlog === undefined ) { continue; }
Squashed commit of the following: commit 7c6cacc59b27660fabacb55d668ef099b222a9e6 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:52:51 2018 -0300 code review: finalize support for wasm-based hntrie commit 8596ed80e3bdac2c36e3c860b51e7189f6bc8487 Merge: cbe1f2e 000eb82 Author: Raymond Hill <rhill@raymondhill.net> Date: Sat Nov 3 08:41:40 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit cbe1f2e2f38484d42af3204ec7f1b5decd30f99e Merge: 270fc7f dbb7e80 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 17:43:20 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit 270fc7f9b3b73d79e6355522c1a42ce782fe7e5c Merge: d2a89cf d693d4f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 16:21:08 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d2a89cf28f0816ffd4617c2c7b4ccfcdcc30e1b4 Merge: d7afc78 649f82f Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 14:54:58 2018 -0300 Merge branch 'master' of github.com:gorhill/uBlock into trie-wasm commit d7afc78b5f5675d7d34c5a1d0ec3099a77caef49 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 13:56:11 2018 -0300 finalize wasm-based hntrie implementation commit e7b9e043cf36ad055791713e34eb0322dec84627 Author: Raymond Hill <rhill@raymondhill.net> Date: Fri Nov 2 08:14:02 2018 -0300 add first-pass implementation of wasm version of hntrie commit 1015cb34624f3ef73ace58b58fe4e03dfc59897f Author: Raymond Hill <rhill@raymondhill.net> Date: Wed Oct 31 17:16:47 2018 -0300 back up draft work toward experimenting with wasm hntries
2018-11-03 12:58:46 +01:00
let logData = entry[1].logData();
logData.source = 'static';
logData.result = 1;
outlog.push(logData);
}
if ( outlog !== undefined ) {
for ( const entry of toRemove.values()) {
const logData = entry.logData();
logData.source = 'static';
logData.result = 2;
outlog.push(logData);
}
}
};
/******************************************************************************/
FilterContainer.prototype.realmMatchString = function(
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
realmBits,
typeBits,
partyBits
) {
const exactType = typeBits & 0x80000000;
typeBits &= 0x7FFFFFFF;
const catBits00 = realmBits;
const catBits01 = realmBits | typeBits;
const catBits10 = realmBits | partyBits;
const catBits11 = realmBits | typeBits | partyBits;
const bucket00 = exactType === 0
2019-05-23 14:15:26 +02:00
? this.categories.get(catBits00)
: undefined;
const bucket01 = exactType !== 0 || typeBits !== 0
2019-05-23 14:15:26 +02:00
? this.categories.get(catBits01)
: undefined;
const bucket10 = exactType === 0 && partyBits !== 0
2019-05-23 14:15:26 +02:00
? this.categories.get(catBits10)
: undefined;
const bucket11 = (exactType !== 0 || typeBits !== 0) && partyBits !== 0
2019-05-23 14:15:26 +02:00
? this.categories.get(catBits11)
: undefined;
if (
2019-05-23 14:15:26 +02:00
bucket00 === undefined && bucket01 === undefined &&
bucket10 === undefined && bucket11 === undefined
) {
return false;
2015-01-23 17:32:49 +01:00
}
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
2019-05-23 14:15:26 +02:00
let catBits = 0, f;
2015-01-23 17:32:49 +01:00
// Pure hostname-based filters
let tokenHash = this.dotTokenHash;
if (
2019-05-23 14:15:26 +02:00
(bucket00 !== undefined) &&
(f = bucket00.get(tokenHash)) !== undefined &&
(f.match() === true)
) {
catBits = catBits00;
} else if (
2019-05-23 14:15:26 +02:00
(bucket01 !== undefined) &&
(f = bucket01.get(tokenHash)) !== undefined &&
(f.match() === true)
) {
catBits = catBits01;
} else if (
2019-05-23 14:15:26 +02:00
(bucket10 !== undefined) &&
(f = bucket10.get(tokenHash)) !== undefined &&
(f.match() === true)
) {
catBits = catBits10;
} else if (
2019-05-23 14:15:26 +02:00
(bucket11 !== undefined) &&
(f = bucket11.get(tokenHash)) !== undefined &&
(f.match() === true)
) {
catBits = catBits11;
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
// Pattern-based filters
2019-05-23 14:15:26 +02:00
else {
const url = urlRegister;
2019-05-23 14:15:26 +02:00
const tokenHashes = this.urlTokenizer.getTokens();
let i = 0, tokenBeg = 0;
for (;;) {
tokenHash = tokenHashes[i];
if ( tokenHash === 0 ) { return false; }
tokenBeg = tokenHashes[i+1];
if (
(bucket00 !== undefined) &&
(f = bucket00.get(tokenHash)) !== undefined &&
(f.match(url, tokenBeg) === true)
) {
catBits = catBits00;
break;
}
if (
(bucket01 !== undefined) &&
(f = bucket01.get(tokenHash)) !== undefined &&
(f.match(url, tokenBeg) === true)
) {
catBits = catBits01;
break;
}
if (
(bucket10 !== undefined) &&
(f = bucket10.get(tokenHash)) !== undefined &&
(f.match(url, tokenBeg) === true)
) {
catBits = catBits10;
break;
}
if (
(bucket11 !== undefined) &&
(f = bucket11.get(tokenHash)) !== undefined &&
(f.match(url, tokenBeg) === true)
) {
catBits = catBits11;
break;
}
i += 2;
}
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
}
2019-05-23 14:15:26 +02:00
this.catbitsRegister = catBits;
this.tokenRegister = tokenHash;
this.filterRegister = f;
return true;
2014-06-24 00:42:43 +02:00
};
/******************************************************************************/
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
// Specialized handler
2016-11-08 13:13:26 +01:00
// https://github.com/gorhill/uBlock/issues/1477
// Special case: blocking-generichide filter ALWAYS exists, it is implicit --
// thus we always first check for exception filters, then for important block
// filter if and only if there was a hit on an exception filter.
// https://github.com/gorhill/uBlock/issues/2103
// User may want to override `generichide` exception filters.
2018-07-22 16:47:02 +02:00
FilterContainer.prototype.matchStringGenericHide = function(requestURL) {
const typeBits = typeNameToTypeValue['generichide'] | 0x80000000;
2016-11-08 13:13:26 +01:00
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
// Prime tokenizer: we get a normalized URL in return.
urlRegister = this.urlTokenizer.setURL(requestURL);
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.filterRegister = null;
2016-12-08 02:18:58 +01:00
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
// These registers will be used by various filters
pageHostnameRegister = requestHostnameRegister =
µb.URI.hostnameFromURI(requestURL);
2016-11-08 13:13:26 +01:00
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
// Exception filters
if ( this.realmMatchString(AllowAction, typeBits, FirstParty) ) {
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
// Important block filters.
if ( this.realmMatchString(BlockImportant, typeBits, FirstParty) ) {
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
return 1;
}
return 2;
2016-11-08 13:13:26 +01:00
}
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
return 0;
2016-11-08 13:13:26 +01:00
};
/******************************************************************************/
2015-04-07 03:26:05 +02:00
// https://github.com/chrisaljoudi/uBlock/issues/116
2016-11-08 13:13:26 +01:00
// Some type of requests are exceptional, they need custom handling,
// not the generic handling.
// https://github.com/chrisaljoudi/uBlock/issues/519
// Use exact type match for anything beyond `other`. Also, be prepared to
// support unknown types.
2014-07-30 03:10:00 +02:00
FilterContainer.prototype.matchString = function(fctxt, modifiers = 0) {
let typeBits = typeNameToTypeValue[fctxt.type];
if ( modifiers === 0 ) {
if ( typeBits === undefined ) {
typeBits = otherTypeBitValue;
} else if ( typeBits === 0 || typeBits > otherTypeBitValue ) {
modifiers |= 0b0001;
}
}
if ( (modifiers & 0b0001) !== 0 ) {
if ( typeBits === undefined ) { return 0; }
typeBits |= 0x80000000;
2015-01-17 13:53:19 +01:00
}
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
const partyBits = fctxt.is3rdPartyToDoc() ? ThirdParty : FirstParty;
2014-06-24 00:42:43 +02:00
// Prime tokenizer: we get a normalized URL in return.
urlRegister = this.urlTokenizer.setURL(fctxt.url);
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
this.filterRegister = null;
2015-02-05 00:06:31 +01:00
// These registers will be used by various filters
pageHostnameRegister = fctxt.getDocHostname();
requestHostnameRegister = fctxt.getHostname();
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
// Important block filters.
if ( this.realmMatchString(BlockImportant, typeBits, partyBits) ) {
return 1;
}
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
// Block filters
if ( this.realmMatchString(BlockAction, typeBits, partyBits) ) {
// Exception filters
if ( this.realmMatchString(AllowAction, typeBits, partyBits) ) {
return 2;
}
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
return 1;
2014-06-24 00:42:43 +02:00
}
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
return 0;
};
/******************************************************************************/
FilterContainer.prototype.toLogData = function() {
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
if ( this.filterRegister === null ) { return; }
const logData = toLogDataInternal(
this.catbitsRegister,
this.tokenRegister,
this.filterRegister
);
logData.source = 'static';
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
logData.tokenHash = this.tokenRegister;
logData.result = this.filterRegister === null
? 0
: (this.catbitsRegister & 1 ? 2 : 1);
return logData;
2014-06-24 00:42:43 +02:00
};
/******************************************************************************/
FilterContainer.prototype.getFilterCount = function() {
2016-03-17 18:56:21 +01:00
return this.acceptedCount - this.discardedCount;
2014-06-24 00:42:43 +02:00
};
/******************************************************************************/
// action: 1=test, 2=record
FilterContainer.prototype.benchmark = async function(action) {
const requests = await µb.loadBenchmarkDataset();
if ( Array.isArray(requests) === false || requests.length === 0 ) {
console.info('No requests found to benchmark');
return;
}
console.info(`Benchmarking staticNetFilteringEngine.matchString()...`);
const fctxt = µb.filteringContext.duplicate();
let expected, recorded;
if ( action === 1 ) {
try {
expected = JSON.parse(
vAPI.localStorage.getItem('FilterContainer.benchmark.results')
);
} catch(ex) {
}
}
if ( action === 2 ) {
recorded = [];
}
const t0 = self.performance.now();
for ( let i = 0; i < requests.length; i++ ) {
const request = requests[i];
fctxt.setURL(request.url);
fctxt.setDocOriginFromURL(request.frameUrl);
fctxt.setType(request.cpt);
const r = this.matchString(fctxt);
if ( recorded !== undefined ) { recorded.push(r); }
if ( expected !== undefined && r !== expected[i] ) {
console.log('Mismatch with reference results:');
console.log(`\tExpected ${expected[i]}, got ${r}:`);
console.log(`\ttype=${fctxt.type}`);
console.log(`\turl=${fctxt.url}`);
console.log(`\tdocOrigin=${fctxt.getDocOrigin()}`);
}
}
const t1 = self.performance.now();
const dur = t1 - t0;
console.info(`Evaluated ${requests.length} requests in ${dur.toFixed(0)} ms`);
console.info(`\tAverage: ${(dur / requests.length).toFixed(3)} ms per request`);
if ( expected !== undefined ) {
console.info(`\tBlocked: ${expected.reduce((n,r)=>{return r===1?n+1:n;},0)}`);
console.info(`\tExcepted: ${expected.reduce((n,r)=>{return r===2?n+1:n;},0)}`);
}
if ( recorded !== undefined ) {
vAPI.localStorage.setItem(
'FilterContainer.benchmark.results',
JSON.stringify(recorded)
);
}
};
/******************************************************************************/
FilterContainer.prototype.test = function(docURL, type, url) {
const fctxt = µb.filteringContext.duplicate();
fctxt.setDocOriginFromURL(docURL);
fctxt.setType(type);
fctxt.setURL(url);
const r = this.matchString(fctxt);
console.log(`${r}`);
if ( r !== 0 ) {
console.log(this.toLogData());
}
};
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
/******************************************************************************-
With default filter lists:
As of 2019-04-18:
{bits: "0", token: "ad", size: 926, f: FilterBucket}
{bits: "0", token: "ads", size: 636, f: FilterBucket}
{bits: "41", token: "phncdn", size: 253, f: FilterBucket}
{bits: "0", token: "analytic", size: 174, f: FilterBucket}
{bits: "0", token: "tracking", size: 155, f: FilterBucket}
{bits: "48", token: "http", size: 146, f: FilterBucket}
{bits: "48", token: "https", size: 139, f: FilterBucket}
{bits: "58", token: "http", size: 122, f: FilterBucket}
{bits: "0", token: "adv", size: 121, f: FilterBucket}
{bits: "58", token: "https", size: 118, f: FilterBucket}
{bits: "0", token: "advertis", size: 102, f: FilterBucket}
{bits: "8", token: "doublecl", size: 96, f: FilterBucket}
{bits: "41", token: "imasdk", size: 90, f: FilterBucket}
{bits: "0", token: "cdn", size: 89, f: FilterBucket}
{bits: "0", token: "track", size: 87, f: FilterBucket}
{bits: "0", token: "stats", size: 82, f: FilterBucket}
{bits: "0", token: "banner", size: 74, f: FilterBucket}
{bits: "0", token: "log", size: 72, f: FilterBucket}
{bits: "0", token: "ga", size: 71, f: FilterBucket}
{bits: "0", token: "gif", size: 67, f: FilterBucket}
{bits: "0", token: "cloudfro", size: 64, f: FilterBucket}
{bits: "0", token: "amazonaw", size: 61, f: FilterBucket}
{bits: "41", token: "ajax", size: 58, f: FilterBucket}
{bits: "0", token: "tracker", size: 56, f: FilterBucket}
{bits: "40", token: "pagead2", size: 53, f: FilterBucket}
{bits: "0", token: "affiliat", size: 53, f: FilterBucket}
*/
FilterContainer.prototype.bucketHistogram = function() {
const results = [];
for ( const [ bits, category ] of this.categories ) {
for ( const [ th, f ] of category ) {
if ( f instanceof FilterPair ) {
const token = µBlock.urlTokenizer.stringFromTokenHash(th);
results.push({ bits: bits.toString(16), token, size: f.size, f });
continue;
}
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
if ( f instanceof FilterBucket ) {
const token = µBlock.urlTokenizer.stringFromTokenHash(th);
results.push({ bits: bits.toString(16), token, size: f.size, f });
continue;
}
if ( f instanceof FilterHostnameDict ) {
const token = µBlock.urlTokenizer.stringFromTokenHash(th);
results.push({ bits: bits.toString(16), token, size: f.size, f });
continue;
}
if ( f instanceof FilterJustOrigin ) {
const token = µBlock.urlTokenizer.stringFromTokenHash(th);
results.push({ bits: bits.toString(16), token, size: f.size, f });
continue;
}
}
}
results.sort((a, b) => {
return b.size - a.size;
});
console.log(results);
};
/*******************************************************************************
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
With default filter lists:
Add HNTrie-based filter classes to store origin-only filters Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/528#issuecomment-484408622 Following STrie-related work in above issue, I noticed that a large number of filters in EasyList were filters which only had to match against the document origin. For instance, among just the top 10 most populous buckets, there were four such buckets with over hundreds of entries each: - bits: 72, token: "http", 146 entries - bits: 72, token: "https", 139 entries - bits: 88, token: "http", 122 entries - bits: 88, token: "https", 118 entries These filters in these buckets have to be matched against all the network requests. In order to leverage HNTrie for these filters[1], they are now handled in a special way so as to ensure they all end up in a single HNTrie (per bucket), which means that instead of scanning hundreds of entries per URL, there is now a single scan per bucket per URL for these apply-everywhere filters. Now, any filter which fulfill ALL the following condition will be processed in a special manner internally: - Is of the form `|https://` or `|http://` or `*`; and - Does have a `domain=` option; and - Does not have a negated domain in its `domain=` option; and - Does not have `csp=` option; and - Does not have a `redirect=` option If a filter does not fulfill ALL the conditions above, no change in behavior. A filter which matches ALL of the above will be processed in a special manner: - The `domain=` option will be decomposed so as to create as many distinct filter as there is distinct value in the `domain=` option - This also apply to the `badfilter` version of the filter, which means it now become possible to `badfilter` only one of the distinct filter without having to `badfilter` all of them. - The logger will always report these special filters with only a single hostname in the `domain=` option. *** [1] HNTrie is currently WASM-ed on Firefox.
2019-04-19 22:33:46 +02:00
As of 2019-04-13:
{"FilterPlainHnAnchored" => 12619}
{"FilterPlainPrefix1" => 8743}
{"FilterGenericHnAnchored" => 5231}
{"FilterOriginHit" => 4149}
{"FilterPair" => 2381}
{"FilterBucket" => 1940}
{"FilterPlainHostname" => 1612}
{"FilterOriginHitSet" => 1430}
{"FilterPlainLeftAnchored" => 799}
{"FilterGeneric" => 588}
{"FilterPlain" => 510}
{"FilterOriginMiss" => 299}
{"FilterDataHolder" => 280}
{"FilterOriginMissSet" => 150}
{"FilterTrue" => 130}
{"FilterRegex" => 124}
{"FilterPlainRightAnchored" => 110}
{"FilterGenericHnAndRightAnchored" => 95}
{"FilterHostnameDict" => 59}
{"FilterPlainPrefix0" => 29}
{"FilterExactMatch" => 5}
{"FilterOriginMixedSet" => 3}
Observations:
- No need for FilterPlainPrefix0.
- FilterPlainHnAnchored and FilterPlainPrefix1 are good candidates
for storing in a plain string trie.
As of 2019-04-25:
{"FilterPlainHnAnchored" => 11078}
{"FilterPlainPrefix1" => 7195}
{"FilterPrefix1Trie" => 5720}
{"FilterOriginHit" => 3561}
{"FilterWildcard2HnAnchored" => 2943}
{"FilterPair" => 2391}
{"FilterBucket" => 1922}
{"FilterWildcard1HnAnchored" => 1910}
{"FilterHnAnchoredTrie" => 1586}
{"FilterPlainHostname" => 1391}
{"FilterOriginHitSet" => 1155}
{"FilterPlain" => 634}
{"FilterWildcard1" => 423}
{"FilterGenericHnAnchored" => 389}
{"FilterOriginMiss" => 302}
{"FilterGeneric" => 163}
{"FilterOriginMissSet" => 150}
{"FilterRegex" => 124}
{"FilterPlainRightAnchored" => 110}
{"FilterGenericHnAndRightAnchored" => 95}
{"FilterHostnameDict" => 59}
{"FilterPlainLeftAnchored" => 30}
{"FilterJustOrigin" => 22}
{"FilterHTTPJustOrigin" => 19}
{"FilterHTTPSJustOrigin" => 18}
{"FilterExactMatch" => 5}
{"FilterOriginMixedSet" => 3}
*/
FilterContainer.prototype.filterClassHistogram = function() {
const filterClassDetails = new Map();
for ( let i = 0; i < filterClasses.length; i++ ) {
filterClassDetails.set(i, { name: filterClasses[i].name, count: 0, });
}
// Artificial classes to report content of tries
filterClassDetails.set(1000, { name: 'FilterPlainTrie', count: 0, });
filterClassDetails.set(1001, { name: 'FilterPlainHnAnchoredTrie', count: 0, });
const countFilter = function(f) {
if ( f instanceof Object === false ) { return; }
filterClassDetails.get(f.fid).count += 1;
if ( f.wrapped ) {
countFilter(f.wrapped);
}
};
for ( const category of this.categories.values() ) {
for ( const f of category.values() ) {
countFilter(f);
if ( f instanceof FilterBucket ) {
for ( const g of f.filters ) { countFilter(g); }
if ( f.plainTrie !== null ) {
filterClassDetails.get(1000).count += f.plainTrie.size;
}
if ( f.plainHnAnchoredTrie !== null ) {
filterClassDetails.get(1001).count += f.plainHnAnchoredTrie.size;
}
continue;
}
if ( f instanceof FilterPair ) {
countFilter(f.f1);
countFilter(f.f2);
continue;
}
}
}
const results = Array.from(filterClassDetails.values()).sort((a, b) => {
return b.count - a.count;
});
console.log(results);
};
/******************************************************************************/
2014-06-24 00:42:43 +02:00
return new FilterContainer();
/******************************************************************************/
})();