uBlock/src/js/uritools.js

539 lines
17 KiB
JavaScript
Raw Normal View History

2014-06-24 00:42:43 +02:00
/*******************************************************************************
2016-04-12 14:48:24 +02:00
uBlock Origin - a browser extension to block requests.
Copyright (C) 2014-present Raymond Hill
2014-06-24 00:42:43 +02:00
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see {http://www.gnu.org/licenses/}.
Home: https://github.com/gorhill/uBlock
*/
2016-04-12 14:48:24 +02:00
/* global publicSuffixList */
2014-06-24 00:42:43 +02:00
2016-10-29 17:15:04 +02:00
'use strict';
2014-06-24 00:42:43 +02:00
/*******************************************************************************
RFC 3986 as reference: http://tools.ietf.org/html/rfc3986#appendix-A
Naming convention from https://en.wikipedia.org/wiki/URI_scheme#Examples
*/
/******************************************************************************/
µBlock.URI = (function() {
/******************************************************************************/
var punycode = self.punycode;
2014-06-24 00:42:43 +02:00
// Favorite regex tool: http://regex101.com/
// Ref: <http://tools.ietf.org/html/rfc3986#page-50>
// Redundant capture groups were removed: capture less = perform faster.
// Performance improvements welcomed.
// jsperf: <http://jsperf.com/old-uritools-vs-new-uritools>
//
// Capture groups: [1] scheme including the trailing ':', [2] '//' followed
// by the authority, [3] path, [4] '?' followed by the query, [5] '#'
// followed by the fragment.
var reRFC3986 = /^([^:\/?#]+:)?(\/\/[^\/?#]*)?([^?#]*)(\?[^#]*)?(#.*)?/;

// Derived
var reSchemeFromURI = /^[^:\/?#]+:/;
var reAuthorityFromURI = /^(?:[^:\/?#]+:)?(\/\/[^\/?#]+)/;
var reOriginFromURI = /^(?:[^:\/?#]+:)\/\/[^\/?#]+/;
// Lowercase http(s) URL whose authority is nothing but a hostname, directly
// followed by '/'.
var reCommonHostnameFromURL = /^https?:\/\/([0-9a-z_][0-9a-z._-]*[0-9a-z])\//;
var rePathFromURI = /^(?:[^:\/?#]+:)?(?:\/\/[^\/?#]*)?([^?#]*)/;
// Matches any character outside of lowercase letters, digits, '.', '_', '-'.
var reMustNormalizeHostname = /[^0-9a-z._-]/;

// These are to parse the authority field, which is not parsed by the above
// official regex.
// IPv6 is seen as an exception: a non-IPv6-compatible regex is tried first,
// and if it fails, the IPv6-compatible regex is then used. This helps
// performance by avoiding the use of a too complicated regex first.
// https://github.com/gorhill/httpswitchboard/issues/211
// "While a hostname may not contain other characters, such as the
// "underscore character (_), other DNS names may contain the underscore"
var reHostPortFromAuthority = /^(?:[^@]*@)?([^:]*)(:\d*)?$/;
var reIPv6PortFromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]*\])(:\d*)?$/i;
var reHostFromNakedAuthority = /^[0-9a-z._-]+[0-9a-z]$/i;
var reHostFromAuthority = /^(?:[^@]*@)?([^:]+)(?::\d*)?$/;
var reIPv6FromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]+\])(?::\d*)?$/i;

// Coarse (but fast) tests
// Each label must start and end with [a-z0-9] and may contain hyphens in
// between. Hyphen runs are matched with `-+` (not `-*`) so that there is only
// one way to split a label into sub-matches: the former `(-*[a-z\d]+)*` form
// accepted the same strings but backtracked exponentially on long
// non-matching input.
var reValidHostname = /^[a-z\d]+(?:-+[a-z\d]+)*(?:\.[a-z\d]+(?:-+[a-z\d]+)*)*$/;
var reIPAddressNaive = /^\d+\.\d+\.\d+\.\d+$|^\[[\da-zA-Z:]+\]$/;
/******************************************************************************/
// Clear every parsed URI component stored on `o` and return `o`.
// `authority` is cleared as well, so that resetting after a previous parse
// does not leave a stale authority behind next to an empty hostname.
// `domain` is deliberately not touched: on the exported object that slot
// holds a method, not parsed data.
var reset = function(o) {
    o.scheme = '';
    o.authority = '';
    o.hostname = '';
    o._ipv4 = undefined;
    o._ipv6 = undefined;
    o.port = '';
    o.path = '';
    o.query = '';
    o.fragment = '';
    return o;
};
// Blank only the fields derived from the authority component (hostname, the
// two IP slots and the port) and hand `o` back. Other fields are left as-is.
var resetAuthority = function(o) {
    o.hostname = o.port = '';
    o._ipv4 = o._ipv6 = undefined;
    return o;
};
/******************************************************************************/
// The object exported by this module. It doubles as the storage for the
// components of the most recently parsed URI and as the holder of the
// bit masks used to select which components to assemble.
var URI = {
    // Parsed components
    scheme: '',
    authority: '',
    hostname: '',
    _ipv4: undefined,
    _ipv6: undefined,
    port: '',
    domain: undefined,
    path: '',
    query: '',
    fragment: '',
    // One bit per component
    schemeBit: 0x01,
    userBit: 0x02,
    passwordBit: 0x04,
    hostnameBit: 0x08,
    portBit: 0x10,
    pathBit: 0x20,
    queryBit: 0x40,
    fragmentBit: 0x80,
    // Every component
    allBits: 0xFFFF
};

// Composite masks, built from the single-bit masks above.
URI.authorityBit =
    URI.userBit |
    URI.passwordBit |
    URI.hostnameBit |
    URI.portBit;
URI.normalizeBits =
    URI.schemeBit |
    URI.hostnameBit |
    URI.pathBit |
    URI.queryBit;
/******************************************************************************/
// See: https://en.wikipedia.org/wiki/URI_scheme#Examples
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
//
//   foo://example.com:8042/over/there?name=ferret#nose
//   \_/   \______________/\_________/ \_________/ \__/
//    |           |            |            |        |
// scheme     authority       path        query   fragment
//    |   _____________________|__
//   / \ /                        \
//   urn:example:animal:ferret:nose
//
// Parse `uri` and store its components on this object.
// - `uri === undefined`, or a string the generic regex cannot match: all
//   components are reset.
// - An authority which cannot be split into host/port: only the
//   authority-derived fields (hostname, port, IP slots) are reset.
// Returns the object holding the parsed components.
URI.set = function(uri) {
    if ( uri === undefined ) {
        return reset(URI);
    }
    var matches = reRFC3986.exec(uri);
    if ( !matches ) {
        return reset(URI);
    }
    // Strip the delimiters the regex captures along with each component.
    this.scheme = matches[1] !== undefined ? matches[1].slice(0, -1) : '';
    this.authority = matches[2] !== undefined ? matches[2].slice(2).toLowerCase() : '';
    this.path = matches[3] !== undefined ? matches[3] : '';

    // <http://tools.ietf.org/html/rfc3986#section-6.2.3>
    // "In general, a URI that uses the generic syntax for authority
    // "with an empty path should be normalized to a path of '/'."
    if ( this.authority !== '' && this.path === '' ) {
        this.path = '/';
    }
    this.query = matches[4] !== undefined ? matches[4].slice(1) : '';
    this.fragment = matches[5] !== undefined ? matches[5].slice(1) : '';

    // Assume very simple authority, i.e. just a hostname (highest likelihood
    // case for µBlock)
    if ( reHostFromNakedAuthority.test(this.authority) ) {
        this.hostname = this.authority;
        this.port = '';
        return this;
    }
    // Authority contains more than just a hostname
    matches = reHostPortFromAuthority.exec(this.authority);
    if ( !matches ) {
        matches = reIPv6PortFromAuthority.exec(this.authority);
        if ( !matches ) {
            return resetAuthority(URI);
        }
    }
    this.hostname = matches[1] !== undefined ? matches[1] : '';
    // http://en.wikipedia.org/wiki/FQDN
    // Remove every trailing dot, the same way `hostnameFromURI` does, so both
    // code paths yield the same hostname for the same URI.
    while ( this.hostname.endsWith('.') ) {
        this.hostname = this.hostname.slice(0, -1);
    }
    this.port = matches[2] !== undefined ? matches[2].slice(1) : '';
    return this;
};
/******************************************************************************/
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
//
//   foo://example.com:8042/over/there?name=ferret#nose
//   \_/   \______________/\_________/ \_________/ \__/
//    |           |            |            |        |
// scheme     authority       path        query   fragment
//    |   _____________________|__
//   / \ /                        \
//   urn:example:animal:ferret:nose
//
// Build a URI string out of the components currently stored on this object.
// `bits` is a mask of `*Bit` values selecting which components to emit; when
// omitted, `allBits` is used. A component is emitted only if it is both
// selected and non-empty.
URI.assemble = function(bits) {
    const mask = bits === undefined ? this.allBits : bits;
    let out = '';
    if ( this.scheme && (mask & this.schemeBit) ) {
        out += `${this.scheme}:`;
    }
    if ( this.hostname && (mask & this.hostnameBit) ) {
        out += `//${this.hostname}`;
    }
    if ( this.port && (mask & this.portBit) ) {
        out += `:${this.port}`;
    }
    if ( this.path && (mask & this.pathBit) ) {
        out += `${this.path}`;
    }
    if ( this.query && (mask & this.queryBit) ) {
        out += `?${this.query}`;
    }
    if ( this.fragment && (mask & this.fragmentBit) ) {
        out += `#${this.fragment}`;
    }
    return out;
};
/******************************************************************************/
2016-04-28 17:28:08 +02:00
// Return the lowercased `scheme://authority` prefix of `uri`, or an empty
// string when `uri` does not start with a scheme followed by a non-empty
// authority.
URI.originFromURI = function(uri) {
    const matches = reOriginFromURI.exec(uri);
    return matches !== null ? matches[0].toLowerCase() : '';
};
/******************************************************************************/
2014-06-24 00:42:43 +02:00
// Extract the scheme of `uri`, lowercased and without the trailing ':'.
// Yields an empty string when no scheme is found.
URI.schemeFromURI = function(uri) {
    const found = reSchemeFromURI.exec(uri);
    return found ? found[0].slice(0, -1).toLowerCase() : '';
};
/******************************************************************************/
// Extract the authority of `uri`, lowercased and without the leading '//'.
// Yields an empty string when `uri` has no (or an empty) authority.
URI.authorityFromURI = function(uri) {
    const found = reAuthorityFromURI.exec(uri);
    return found ? found[1].slice(2).toLowerCase() : '';
};
/******************************************************************************/
// The most used function, so it better be fast.
// https://github.com/gorhill/uBlock/issues/1559
//   See http://en.wikipedia.org/wiki/FQDN
// https://bugzilla.mozilla.org/show_bug.cgi?id=1360285
//   Revisit punycode dependency when above issue is fixed in Firefox.
//
// Return the hostname part of `uri`, or an empty string when no hostname can
// be extracted. Trailing dots are removed, and a hostname containing anything
// other than lowercase letters, digits, '.', '_' or '-' is lowercased and
// passed through `punycode.toASCII()`.
URI.hostnameFromURI = function(uri) {
    // Fast path: lowercase http(s) URL whose authority is a bare hostname.
    var matches = reCommonHostnameFromURL.exec(uri);
    if ( matches !== null ) { return matches[1]; }
    matches = reAuthorityFromURI.exec(uri);
    if ( matches === null ) { return ''; }
    // Drop the leading '//'.
    var authority = matches[1].slice(2);
    // Assume very simple authority (most common case for µBlock)
    if ( reHostFromNakedAuthority.test(authority) ) {
        return authority.toLowerCase();
    }
    // Authority carries user info and/or a port; try the cheap regex first,
    // and the IPv6-aware one only if that fails.
    matches = reHostFromAuthority.exec(authority);
    if ( matches === null ) {
        matches = reIPv6FromAuthority.exec(authority);
        if ( matches === null ) { return ''; }
    }
    var hostname = matches[1];
    while ( hostname.endsWith('.') ) {
        hostname = hostname.slice(0, -1);
    }
    if ( reMustNormalizeHostname.test(hostname) ) {
        hostname = punycode.toASCII(hostname.toLowerCase());
    }
    return hostname;
};
/******************************************************************************/
2015-01-08 16:37:19 +01:00
// Return the domain for `hostname`, using the domain cache.
// On a cache hit the entry's timestamp is refreshed. On a miss, the result is
// computed, added to the cache and returned: an IP address (per the naive
// regex) is its own domain, anything else is resolved through
// `psl.getDomain()`.
URI.domainFromHostname = function(hostname) {
    const entry = domainCache.get(hostname);
    if ( entry !== undefined ) {
        entry.tstamp = Date.now();
        return entry.domain;
    }
    if ( reIPAddressNaive.test(hostname) === false ) {
        return domainCacheAdd(hostname, psl.getDomain(hostname));
    }
    return domainCacheAdd(hostname, hostname);
};
2018-04-05 21:22:19 +02:00
// Same lookup as `domainFromHostname`, but the domain cache is neither read
// nor written: an IP address (per the naive regex) is returned unchanged,
// everything else is resolved through `psl.getDomain()`.
URI.domainFromHostnameNoCache = function(hostname) {
    if ( reIPAddressNaive.test(hostname) ) {
        return hostname;
    }
    return psl.getDomain(hostname);
};
2015-01-08 16:37:19 +01:00
// Return the domain of the hostname currently stored on this object, by
// delegating to `domainFromHostname`. This assignment replaces the
// `domain: undefined` placeholder declared in the `URI` object literal.
URI.domain = function() {
return this.domainFromHostname(this.hostname);
};
2014-06-24 00:42:43 +02:00
// It is expected that there is a higher-scoped `publicSuffixList` lingering
// somewhere. Cache it. See <https://github.com/gorhill/publicsuffixlist.js>.
// The domain lookup functions of this file call `psl.getDomain()`.
var psl = publicSuffixList;
2015-01-08 16:37:19 +01:00
/******************************************************************************/
// Turn a domain into its "entity" form: everything before the first dot,
// followed by '.*' (`example.com` => `example.*`). A domain without any dot
// has no entity form, in which case an empty string is returned.
URI.entityFromDomain = function(domain) {
    const firstDot = domain.indexOf('.');
    if ( firstDot === -1 ) {
        return '';
    }
    return `${domain.slice(0, firstDot)}.*`;
};
/******************************************************************************/
2016-01-22 17:13:29 +01:00
// Extract the path component of `uri`: whatever follows the optional scheme
// and authority, up to (excluding) the first '?' or '#'.
URI.pathFromURI = function(uri) {
    const found = rePathFromURI.exec(uri);
    if ( found === null ) {
        return '';
    }
    return found[1];
};
/******************************************************************************/
2015-01-08 16:37:19 +01:00
// Trying to alleviate the worries of looking up too often the domain name from
// a hostname. With a cache, uBlock benefits given that it deals with a
// specific set of hostnames within a narrow time span -- in other words, I
// believe the probability of a cache hit is high in uBlock.

// hostname => DomainCacheEntry
const domainCache = new Map();
// When the cache size reaches the high water mark, it is pruned back down to
// the low water mark.
const domainCacheCountLowWaterMark = 40;
const domainCacheCountHighWaterMark = 60;
// Maximum number of disposed entries kept around for reuse: the number of
// entries removed by one pruning pass.
const domainCacheEntryJunkyardMax =
    domainCacheCountHighWaterMark - domainCacheCountLowWaterMark;
// One cached hostname => domain result: the resolved `domain` plus the time
// (`tstamp`, from `Date.now()`) at which the entry was last initialized.
const DomainCacheEntry = function(domain) {
    this.init(domain);
};

DomainCacheEntry.prototype = {
    // (Re)initialize this entry for `domain` and stamp it with the current
    // time. Returns the entry itself so it can be used in an expression.
    init: function(domain) {
        this.domain = domain;
        this.tstamp = Date.now();
        return this;
    },
    // Release this entry: forget the domain and, if the junkyard is not
    // full, park the instance there for later reuse.
    dispose: function() {
        this.domain = '';
        if ( domainCacheEntryJunkyard.length < domainCacheEntryJunkyardMax ) {
            domainCacheEntryJunkyard.push(this);
        }
    },
};
// Return a cache entry initialized for `domain`, recycling a disposed entry
// from the junkyard when one is available, or else allocating a new one.
const domainCacheEntryFactory = function(domain) {
    return domainCacheEntryJunkyard.length !== 0 ?
        domainCacheEntryJunkyard.pop().init(domain) :
        new DomainCacheEntry(domain);
};

// Disposed cache entries waiting to be reused by the factory above.
const domainCacheEntryJunkyard = [];
2015-01-08 16:37:19 +01:00
// Record `hostname` => `domain` in the domain cache and return `domain`.
// If `hostname` is already cached, only the timestamp of the existing entry
// is refreshed. Otherwise a new entry is stored, and the cache is pruned
// when its size reaches the high water mark.
const domainCacheAdd = function(hostname, domain) {
    const entry = domainCache.get(hostname);
    if ( entry !== undefined ) {
        entry.tstamp = Date.now();
    } else {
        domainCache.set(hostname, domainCacheEntryFactory(domain));
        if ( domainCache.size === domainCacheCountHighWaterMark ) {
            domainCachePrune();
        }
    }
    return domain;
};
// Comparator over cached hostnames `a` and `b`: orders them by the timestamp
// of their cache entries, most recent first.
const domainCacheEntrySort = function(a, b) {
    return domainCache.get(b).tstamp - domainCache.get(a).tstamp;
};
// Shrink the domain cache down to the low water mark: hostnames are ordered
// with `domainCacheEntrySort`, the first `domainCacheCountLowWaterMark` of
// them are kept, and every other entry is disposed of and removed.
const domainCachePrune = function() {
    const hostnames = Array.from(domainCache.keys())
        .sort(domainCacheEntrySort)
        .slice(domainCacheCountLowWaterMark);
    let i = hostnames.length;
    while ( i-- ) {
        const hostname = hostnames[i];
        domainCache.get(hostname).dispose();
        domainCache.delete(hostname);
    }
};
2018-04-05 21:22:19 +02:00
// Whenever a 'publicSuffixList' event is dispatched on `window`, drop every
// cached hostname => domain result.
window.addEventListener('publicSuffixList', ( ) => {
    domainCache.clear();
});
2015-04-29 16:29:44 +02:00
2014-06-24 00:42:43 +02:00
/******************************************************************************/
// Resolve the domain of `uri` by extracting its hostname and looking up the
// domain of that hostname. A falsy `uri` yields an empty string.
URI.domainFromURI = function(uri) {
    return uri
        ? this.domainFromHostname(this.hostnameFromURI(uri))
        : '';
};
/******************************************************************************/
2016-10-29 17:15:04 +02:00
// Tell whether `uri` starts with one of the network schemes recognized here
// (ftp, ftps, http, https, ws, wss) followed by '://'. The comparison is
// case-sensitive.
URI.isNetworkURI = function(uri) {
    return reNetworkURI.test(uri);
};

var reNetworkURI = /^(?:ftps?|https?|wss?):\/\//;
2016-10-29 17:15:04 +02:00
/******************************************************************************/
// Tell whether `scheme` (without the trailing ':') is exactly one of the
// network schemes recognized here: ftp, ftps, http, https, ws, wss. The
// comparison is case-sensitive.
URI.isNetworkScheme = function(scheme) {
    return reNetworkScheme.test(scheme);
};

var reNetworkScheme = /^(?:ftps?|https?|wss?)$/;
2016-10-29 17:15:04 +02:00
/******************************************************************************/
2014-06-24 00:42:43 +02:00
// Assemble the URI the way µBlock expects it, i.e. from the components
// selected by `normalizeBits` (scheme, hostname, path, query). As a result,
// the following are left out:
// - port
// - user id/password
// - fragment
URI.normalizedURI = function() {
    const { normalizeBits } = this;
    return this.assemble(normalizeBits);
};
/******************************************************************************/
// Assemble `scheme://hostname` from the components stored on this object.
// Without a hostname there is no root URL: an empty string is returned.
URI.rootURL = function() {
    return this.hostname
        ? this.assemble(this.schemeBit | this.hostnameBit)
        : '';
};
/******************************************************************************/
// Coarse check of whether `hostname` looks like a valid hostname, according
// to `reValidHostname`. Always returns a boolean and never throws.
// Non-string values are rejected outright: `RegExp.prototype.test()` coerces
// its argument to a string, so `undefined` or `null` would otherwise be
// tested as the text "undefined" / "null" and be reported as valid.
URI.isValidHostname = function(hostname) {
    return typeof hostname === 'string' && reValidHostname.test(hostname);
};
/******************************************************************************/
// Return the direct parent of `hostname`, that is, `hostname` without its
// first label. There is no parent, and `undefined` is returned, when the
// domain lookup yields either an empty string or `hostname` itself:
//
//   hostname                    domain         result
//   `localhost`              => ``          => undefined
//   `example.org`            => example.org => undefined
//   `www.example.org`        => example.org => `example.org`
//   `tomato.www.example.org` => example.org => `www.example.org`
URI.parentHostnameFromHostname = function(hostname) {
    const domain = this.domainFromHostname(hostname);
    const hasParent = domain !== '' && domain !== hostname;
    if ( hasParent === false ) {
        return undefined;
    }
    const firstDot = hostname.indexOf('.');
    return hostname.slice(firstDot + 1);
};
/******************************************************************************/
// List every parent hostname which can be derived from `hostname`, from the
// direct parent up to and including the domain. The list is empty when the
// domain lookup yields an empty string or `hostname` itself.
//
// TODO: I should create an object which is optimized to receive
// the list of hostnames by making it reusable (junkyard etc.) and which
// has its own element counter property in order to avoid memory
// alloc/dealloc.
URI.parentHostnamesFromHostname = function(hostname) {
    const domain = this.domainFromHostname(hostname);
    if ( domain === '' || domain === hostname ) {
        return [];
    }
    const parents = [];
    let current = hostname;
    let dot = current.indexOf('.');
    // Peel off one leading label per iteration, until either the domain has
    // been collected or there is no label left to remove.
    while ( dot >= 0 ) {
        current = current.slice(dot + 1);
        parents.push(current);
        if ( current === domain ) {
            break;
        }
        dot = current.indexOf('.');
    }
    return parents;
};
/******************************************************************************/
// List `hostname` itself followed by all its parent hostnames, i.e. ordered
// from self up to the domain inclusively.
URI.allHostnamesFromHostname = function(hostname) {
    return [ hostname, ...this.parentHostnamesFromHostname(hostname) ];
};
/******************************************************************************/
// String form of the currently stored URI: `assemble()` called without a
// mask, which makes it fall back to `allBits`.
URI.toString = function() {
return this.assemble();
};
/******************************************************************************/
// Export
return URI;
/******************************************************************************/
})();
/******************************************************************************/