From a944873b837b4eae98f2f0ee32ecb09cd0e32b95 Mon Sep 17 00:00:00 2001 From: gorhill Date: Tue, 29 Dec 2015 11:34:41 -0500 Subject: [PATCH] code review: convert static filtering's tokenizer to a global utility --- .jshintrc | 29 ++++++----- src/js/static-net-filtering.js | 82 +++++------------------------- src/js/tab.js | 2 +- src/js/utils.js | 93 +++++++++++++++++++++++++++------- 4 files changed, 106 insertions(+), 100 deletions(-) diff --git a/.jshintrc b/.jshintrc index df4628b92..625274fe4 100644 --- a/.jshintrc +++ b/.jshintrc @@ -1,21 +1,24 @@ { "browser": true, "devel": true, + "eqeqeq": true, "esnext": true, - "globalstrict": true, - "undef": true, - "unused": true, - "nonew": false, - "sub": true, - "laxbreak": true, - "validthis": true, - "newcap": false, - "-W058": true, // suppress "Missing '()' invoking a constructor" message "globals": { + "chrome": false, + "Components": false, // global variable in Firefox + "safari": false, "self": false, "vAPI": false, - "chrome": false, - "safari": false, - "Components": false // global variable in Firefox - } + "µBlock": false + }, + "globalstrict": true, + "laxbreak": true, + "newcap": false, + "nonew": false, + "strict": "global", + "sub": true, + "undef": true, + "unused": true, + "validthis": true, + "-W058": true // suppress "Missing '()' invoking a constructor" message } diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index 33f700627..e7ec8c8b0 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -19,8 +19,8 @@ Home: https://github.com/gorhill/uBlock */ -/* jshint bitwise: false, esnext: true */ -/* global punycode, µBlock */ +/* jshint bitwise: false */ +/* global punycode */ /******************************************************************************/ @@ -1657,19 +1657,10 @@ FilterParser.prototype.makeToken = function() { /******************************************************************************/ /******************************************************************************/ -var TokenEntry = function() { - this.beg = 0; - this.token = ''; -}; - -/******************************************************************************/ -/******************************************************************************/ - var FilterContainer = function() { - this.reAnyToken = /[%0-9a-z]+/g; this.reIsGeneric = /[\^\*]/; - this.tokens = []; this.filterParser = new FilterParser(); + this.urlTokenizer = µb.urlTokenizer; this.reset(); }; @@ -2254,42 +2245,6 @@ FilterContainer.prototype.filterRegexFromCompiled = function(compiled, flags) { /******************************************************************************/ -// Since the addition of the `important` evaluation, this means it is now -// likely that the url will have to be scanned more than once. So this is -// to ensure we do it once only, and reuse results. - -FilterContainer.prototype.tokenize = function(url) { - var tokens = this.tokens; - var re = this.reAnyToken; - var matches, tokenEntry; - re.lastIndex = 0; - var i = 0; - while ( (matches = re.exec(url)) ) { - tokenEntry = tokens[i]; - if ( tokenEntry === undefined ) { - tokenEntry = tokens[i] = new TokenEntry(); - } - tokenEntry.beg = matches.index; - tokenEntry.token = matches[0]; - i += 1; - - // https://github.com/chrisaljoudi/uBlock/issues/1118 - // Crazy case... but I guess we have to expect the worst... - if ( i === 2048 ) { - break; - } - } - - // Sentinel - tokenEntry = tokens[i]; - if ( tokenEntry === undefined ) { - tokenEntry = tokens[i] = new TokenEntry(); - } - tokenEntry.token = ''; -}; - -/******************************************************************************/ - FilterContainer.prototype.matchTokens = function(bucket, url) { // Hostname-only filters var f = bucket['.']; @@ -2299,7 +2254,7 @@ FilterContainer.prototype.matchTokens = function(bucket, url) { return true; } - var tokens = this.tokens; + var tokens = this.urlTokenizer.getTokens(); var tokenEntry, token; var i = 0; for (;;) { @@ -2336,22 +2291,20 @@ FilterContainer.prototype.matchTokens = function(bucket, url) { // not the generic handling. FilterContainer.prototype.matchStringExactType = function(context, requestURL, requestType) { - var url = requestURL.toLowerCase(); - - // These registers will be used by various filters - pageHostnameRegister = context.pageHostname || ''; - requestHostnameRegister = µb.URI.hostnameFromURI(requestURL); - - var party = isFirstParty(context.pageDomain, requestHostnameRegister) ? FirstParty : ThirdParty; - // Be prepared to support unknown types var type = typeNameToTypeValue[requestType] || 0; if ( type === 0 ) { return undefined; } - // Tokenize only once - this.tokenize(url); + // Prime tokenizer: we get a normalized URL in return. + var url = this.urlTokenizer.setURL(requestURL); + + // These registers will be used by various filters + pageHostnameRegister = context.pageHostname || ''; + requestHostnameRegister = µb.URI.hostnameFromURI(url); + + var party = isFirstParty(context.pageDomain, requestHostnameRegister) ? FirstParty : ThirdParty; this.fRegister = null; @@ -2426,12 +2379,6 @@ FilterContainer.prototype.matchString = function(context) { return this.matchStringExactType(context, context.requestURL, context.requestType); } - // https://github.com/gorhill/httpswitchboard/issues/239 - // Convert url to lower case: - // `match-case` option not supported, but then, I saw only one - // occurrence of it in all the supported lists (bulgaria list). - var url = context.requestURL.toLowerCase(); - // The logic here is simple: // // block = !whitelisted && blacklisted @@ -2453,14 +2400,13 @@ FilterContainer.prototype.matchString = function(context) { // filters are tested *only* if there is a (unlikely) hit on a block // filter. + // Prime tokenizer: we get a normalized URL in return. + var url = this.urlTokenizer.setURL(context.requestURL); // These registers will be used by various filters pageHostnameRegister = context.pageHostname || ''; requestHostnameRegister = context.requestHostname; - // Tokenize only once - this.tokenize(url); - this.fRegister = null; var party = isFirstParty(context.pageDomain, context.requestHostname) ? FirstParty : ThirdParty; diff --git a/src/js/tab.js b/src/js/tab.js index 009d63d32..05611fdf0 100644 --- a/src/js/tab.js +++ b/src/js/tab.js @@ -769,7 +769,7 @@ vAPI.tabs.registerListeners(); if ( pageStore !== null ) { state = pageStore.getNetFilteringSwitch(); if ( state && this.userSettings.showIconBadge && pageStore.perLoadBlockedRequestCount ) { - badge = this.utils.formatCount(pageStore.perLoadBlockedRequestCount); + badge = this.formatCount(pageStore.perLoadBlockedRequestCount); } } diff --git a/src/js/utils.js b/src/js/utils.js index 465e41a65..14fcb2893 100644 --- a/src/js/utils.js +++ b/src/js/utils.js @@ -1,7 +1,7 @@ /******************************************************************************* - µBlock - a browser extension to block requests. - Copyright (C) 2014 Raymond Hill + uBlock Origin - a browser extension to block requests. + Copyright (C) 2014-2015 Raymond Hill This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,22 +19,87 @@ Home: https://github.com/gorhill/uBlock */ -/* global µBlock */ 'use strict'; /******************************************************************************/ -// This will inserted as a module in the µBlock object. +// A standalone URL tokenizer will allow us to use URL tokens in more than +// just static filtering engine. This opens the door to optimize other +// filtering engine parts aside static filtering. This also allows: +// - Tokenize only on demand. +// - To potentially avoid tokenizing when same URL is fed to tokenizer. +// - Benchmarking shows this to be a common occurrence. -µBlock.utils = (function() { +µBlock.urlTokenizer = { + setURL: function(url) { + if ( url !== this._urlIn ) { + this._urlIn = url; + this._urlOut = url.toLowerCase(); + this._tokenized = false; + } + return this._urlOut; + }, + + // Tokenize on demand. + getTokens: function() { + if ( this._tokenized === false ) { + this._tokenize(); + this._tokenized = true; + } + return this._tokens; + }, + + isTokenized: function() { + return this._tokens !== null && this._tokens[0].token !== ''; + }, + + _Entry: function() { + this.beg = 0; + this.token = ''; + }, + + // https://github.com/chrisaljoudi/uBlock/issues/1118 + // We limit to a maximum number of tokens. + _init: function() { + this._tokens = new Array(2048); + for ( var i = 0; i < 2048; i++ ) { + this._tokens[i] = new this._Entry(); + } + + this._init = null; + }, + + _tokenize: function() { + var tokens = this._tokens, + re = this._reAnyToken, + url = this._urlOut; + var matches, entry; + re.lastIndex = 0; + + for ( var i = 0; i < 2047; i++ ) { + matches = re.exec(url); + if ( matches === null ) { + break; + } + entry = tokens[i]; + entry.beg = matches.index; + entry.token = matches[0]; + } + tokens[i].token = ''; // Sentinel + }, + + _urlIn: '', + _urlOut: '', + _tokenized: false, + _tokens: null, + _reAnyToken: /[%0-9a-z]+/g +}; + +µBlock.urlTokenizer._init(); /******************************************************************************/ -var exports = {}; - -/******************************************************************************/ - -exports.formatCount = function(count) { +µBlock.formatCount = function(count) { if ( typeof count !== 'number' ) { return ''; } @@ -58,11 +123,3 @@ exports.formatCount = function(count) { // https://www.youtube.com/watch?v=DyvzfyqYm_s /******************************************************************************/ - -return exports; - -/******************************************************************************/ - -})(); - -/******************************************************************************/