2017-11-02 20:49:11 +01:00
|
|
|
/*******************************************************************************
|
|
|
|
|
|
|
|
uBlock Origin - a browser extension to block requests.
|
|
|
|
Copyright (C) 2017 Raymond Hill
|
|
|
|
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program. If not, see {http://www.gnu.org/licenses/}.
|
|
|
|
|
|
|
|
Home: https://github.com/gorhill/uBlock
|
|
|
|
*/
|
|
|
|
|
|
|
|
'use strict';
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
|
|
|
The original prototype was to develop an idea I had about using jump indices
|
|
|
|
in a TypedArray for quickly matching hostnames (or more generally strings)[1].
|
|
|
|
Once I had a working, un-optimized prototype, I realized I had ended up
|
|
|
|
with something formally named a "trie": <https://en.wikipedia.org/wiki/Trie>,
|
|
|
|
hence the name. I have no idea whether the implementation here or one
|
|
|
|
resembling it has been done elsewhere.
|
|
|
|
|
|
|
|
"HN" in HNTrieBuilder stands for "HostName", because the trie is specialized
|
|
|
|
to deal with matching hostnames -- which is a bit more complicated than
|
|
|
|
matching plain strings.
|
|
|
|
|
|
|
|
For example, `www.abc.com` is deemed matching `abc.com`, because the former
|
|
|
|
is a subdomain of the latter. The opposite is of course not true.
|
|
|
|
|
|
|
|
The resulting read-only tries created as a result of using HNTrieBuilder are
|
|
|
|
simply just typed arrays filled with integers. The matching algorithm is
|
|
|
|
just a matter of reading/comparing these integers, and further using them as
|
|
|
|
indices in the array as a way to move around in the trie.
|
|
|
|
|
|
|
|
There is still room for optimizations. Specifically, I could force the
|
|
|
|
strings to be properly sorted so that `HNTrie.matches` could bail earlier
|
|
|
|
when trying to find a matching descendant -- but suspect the gain would be
|
|
|
|
marginal, if measurable.
|
|
|
|
|
|
|
|
[1] To solve <https://github.com/gorhill/uBlock/issues/3193>
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Constructor for a mutable trie builder. All state lives in the fields set
// up by reset() (the working array `buf` and its used size `bufsz`), so
// construction simply delegates to it.
var HNTrieBuilder = function() {
    this.reset();
};
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
|
|
|
A plain javascript array is used to build the trie. It will be cast into
|
|
|
|
the appropriate read-only TypedArray[1] at vacuum time.
|
|
|
|
|
|
|
|
[1] Depending on the size: Uint8Array, Uint16Array, or Uint32Array.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Discards whatever was built so far and returns the builder to its empty
// state. Returns the builder itself so calls can be chained.
HNTrieBuilder.prototype.reset = function() {
    // The working array starts with one zero-filled 3-entry cell
    // (descendant index, sibling index, character code) while `bufsz` stays
    // at 0: the first add() finds no match in that placeholder and then
    // writes its first real cell over it, at index 0.
    this.buf = [ 0, 0, 0 ];
    this.bufsz = 0;
    return this;
};
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
|
|
|
Helpers for convenience.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
// One-shot helper: feeds a '|'-separated list of hostnames to a fresh
// builder and returns the result of vacuum(), i.e. a read-only trie, or
// null when nothing was stored.
HNTrieBuilder.fromDomainOpt = function(domainOpt) {
    var builder = new HNTrieBuilder();
    builder.fromDomainOpt(domainOpt);
    return builder.vacuum();
};
|
|
|
|
|
|
|
|
// One-shot helper: feeds an iterable of hostnames to a fresh builder and
// returns the result of vacuum(), i.e. a read-only trie, or null when
// nothing was stored.
HNTrieBuilder.fromIterable = function(hostnames) {
    var builder = new HNTrieBuilder();
    builder.fromIterable(hostnames);
    return builder.vacuum();
};
|
|
|
|
|
|
|
|
// Debugging aid: dumps a vacuumed trie (an object whose `buf` holds 4+n
// entry cells) to the console, one cell per line. Each line is prefixed
// with one middle dot per character already consumed on the way to that
// cell, and shows the cell's characters in hostname (i.e. un-reversed)
// order.
HNTrieBuilder.print = function(trie) {
    // vacuum() returns null for an empty builder: nothing to print.
    if ( trie === null || trie === undefined ) { return; }
    var buf = trie.buf,
        forks = [],             // stack of (cell index, indent) pairs
        i = 0, indent = 0,
        cc, ic;
    for (;;) {
        if ( buf[i] !== 0 ) {   // cell has a descendant: come back later
            forks.push(i, indent);
        }
        // Characters are stored in reverse hostname order, hence each
        // merged character is prepended to the cell's first character.
        cc = [ buf[i+2] ];
        for ( ic = 0; ic < buf[i+3]; ic++ ) {
            cc.unshift(buf[i+4+ic]);
        }
        console.log('\xB7'.repeat(indent) + String.fromCharCode.apply(null, cc));
        indent += cc.length;
        i = buf[i+1];           // follow the sibling jump index
        if ( i === 0 ) {        // end of branch: resume at the last fork
            if ( forks.length === 0 ) { break; }
            indent = forks.pop();
            i = buf[forks.pop()];
        }
    }
};
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
|
|
|
Since this trie is specialized for matching hostnames, the stored strings are
|
|
|
|
reversed internally, because of hostname comparison logic:
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
Correct matching:
|
2017-11-02 20:49:11 +01:00
|
|
|
index 0123456
|
|
|
|
abc.com
|
|
|
|
|
|
|
|
|
www.abc.com
|
|
|
|
index 01234567890
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
Incorrect matching (typically used for plain strings):
|
2017-11-02 20:49:11 +01:00
|
|
|
index 0123456
|
|
|
|
abc.com
|
|
|
|
|
|
|
|
|
www.abc.com
|
|
|
|
index 01234567890
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Stores hostname `hn` in the un-vacuumed trie. Empty strings and hostnames
// already present are ignored. Nothing is returned.
//
// Characters are consumed from the last one to the first one, followed by a
// terminating character zero. Each cell is 3 entries: [i+0] jump index to a
// "descendant" (an alternative cell for the same character position),
// [i+1] jump index to a "sibling" (the cell for the next character), and
// [i+2] the character code.
HNTrieBuilder.prototype.add = function(hn) {
    var ichar = hn.length - 1;
    if ( ichar === -1 ) { return; }
    var buf = this.buf,
        c = hn.charCodeAt(ichar),
        i = 0, inext;
    // Walk the part of the string which is already in the trie.
    for (;;) {
        if ( buf[i+2] !== c ) {             // match not found
            inext = buf[i];                 // move to descendant
            if ( inext === 0 ) { break; }   // no descendant
        } else {                            // match found
            if ( c === 0 ) { return; }      // whole string already stored
            inext = buf[i+1];               // move to sibling
            ichar -= 1;
            c = ichar === -1 ? 0 : hn.charCodeAt(ichar);
        }
        i = inext;
    }
    // Any new string added will always cause a new descendant to be created.
    // The only time this is not the case is when trying to store a string
    // which is already in the trie.
    inext = this.bufsz;                     // new descendant cell
    buf[i] = inext;
    buf[inext+0] = 0;                       // jump index to descendant
    buf[inext+1] = 0;                       // jump index to sibling
    buf[inext+2] = c;                       // character code
    this.bufsz += 3;
    // New branch sprouting made from all characters left to store; the
    // character zero is always the last cell.
    while ( c !== 0 ) {
        i = inext;
        ichar -= 1;
        c = ichar === -1 ? 0 : hn.charCodeAt(ichar);
        inext = this.bufsz;
        buf[i+1] = inext;
        buf[inext+0] = 0;
        buf[inext+1] = 0;
        buf[inext+2] = c;
        this.bufsz += 3;
    }
};
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
|
|
|
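Note: an earlier revision avoided String.split('|') to limit memory churning; the code below does use String.split('|') and hands the resulting array to fromIterable().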
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Adds every hostname found in `hostnames`, a single string in which
// hostnames are separated by '|'. Returns whatever fromIterable() returns.
HNTrieBuilder.prototype.fromDomainOpt = function(hostnames) {
    return this.fromIterable(hostnames.split('|'));
};
|
|
|
|
|
|
|
|
// Adds every hostname yielded by `hostnames` (any iterable of strings) and
// returns the builder. The iterable itself is left untouched: sorting is
// done on a copy.
HNTrieBuilder.prototype.fromIterable = function(hostnames) {
    // https://github.com/gorhill/uBlock/issues/3328
    //   Must sort from shortest to longest.
    var hns = Array.from(hostnames).sort(function(a, b) {
        return a.length - b.length;
    });
    for ( var hn of hns ) {
        this.add(hn);
    }
    return this;
};
|
|
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
|
|
|
|
// Tests `needle` against the un-vacuumed trie held by the builder. Returns
// true when `needle` is one of the stored hostnames or a subdomain of one
// of them, false otherwise.
HNTrieBuilder.prototype.matches = function(needle) {
    // Nothing stored: the zero-filled placeholder cell left by reset() must
    // not be mistaken for a stored end-of-string.
    if ( this.bufsz === 0 ) { return false; }
    var buf = this.buf,
        ichar = needle.length - 1,
        i = 0, c, cc;
    for (;;) {
        c = ichar === -1 ? 0 : needle.charCodeAt(ichar);
        while ( (cc = buf[i+2]) !== c ) {
            // A character zero cell means a stored hostname ends right
            // here: if the needle goes on with a dot, the needle is a
            // subdomain of that hostname.
            if ( cc === 0 && c === 0x2E ) { return true; }
            i = buf[i];                         // next descendant
            if ( i === 0 ) { return false; }    // no more descendants
        }
        if ( c === 0 ) { return true; }         // exact match
        i = buf[i+1];                           // next sibling
        // Defensive: add() always terminates a branch with a character
        // zero cell, so a non-zero cell is not expected to lack a sibling.
        if ( i === 0 ) { return c === 0x2E; }
        ichar -= 1;
    }
};
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
|
|
|
Before vacuuming, each cell is 3 entry-long:
|
|
|
|
- Jump index to descendant (if any)
|
|
|
|
- Jump index to sibling (if any)
|
|
|
|
- character code
|
|
|
|
|
|
|
|
All strings stored in the un-vacuumed trie are zero-terminated, and the
|
|
|
|
character zero does occupy a cell like any other character. Let's use _ to
|
|
|
|
represent character zero for sake of comments. The asterisk will be used to
|
|
|
|
highlight a node with a descendant.
|
|
|
|
|
|
|
|
Cases, before vacuuming:
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
abc.com, abc.org: 16 cells
|
2017-11-02 20:49:11 +01:00
|
|
|
*
|
|
|
|
_ -- a -- b -- c -- . -- c -- o -- m
|
|
|
|
_ -- a -- b -- c -- . -- o -- r -- g
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
abc.com, xyz.com: 12 cells
|
2017-11-02 20:49:11 +01:00
|
|
|
*
|
|
|
|
_ -- a -- b -- c -- . -- c -- o -- m
|
|
|
|
_ -- x -- y -- z
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
ab.com, b.com: 8 cells
|
2017-11-02 20:49:11 +01:00
|
|
|
*
|
|
|
|
_ -- a -- b -- . -- c -- o -- m
|
|
|
|
_
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
b.com, ab.com: 8 cells
|
2017-11-02 20:49:11 +01:00
|
|
|
*
|
|
|
|
_ -- b -- . -- c -- o -- m
|
|
|
|
_ -- a
|
|
|
|
|
|
|
|
Vacuuming is the process of merging sibling cells with no descendants. Cells
|
|
|
|
with descendants can't be merged.
|
|
|
|
|
|
|
|
Each time we arrive at the end of a horizontal branch (sibling jump index is
|
|
|
|
0), we walk back to the nearest previous node with descendants, and repeat
|
|
|
|
the process. Since there is no index information on where to come back, a
|
|
|
|
stack is used to remember cells with descendants (descendant jump index is
|
|
|
|
non-zero) encountered on the way.
|
|
|
|
|
|
|
|
After vacuuming, each cell is 4+n entry-long:
|
|
|
|
- Jump index to descendant (if any)
|
|
|
|
- Jump index to sibling (if any)
|
|
|
|
- character code
|
|
|
|
- length of merged character code(s)
|
|
|
|
|
|
|
|
Cases, after vacuuming:
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
abc.com, abc.org: 2 cells
|
2017-11-02 20:49:11 +01:00
|
|
|
*
|
|
|
|
[abc.co]m
|
|
|
|
[abc.or]g
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
abc.com, xyz.com: 3 cells
|
2017-11-02 20:49:11 +01:00
|
|
|
*
|
|
|
|
[ab]c -- [.co]m
|
|
|
|
[xy]z
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
ab.com, b.com: 3 cells
|
2017-11-02 20:49:11 +01:00
|
|
|
*
|
|
|
|
a -- [b.co]m
|
|
|
|
_
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
b.com, ab.com: 3 cells
|
2017-11-02 20:49:11 +01:00
|
|
|
*
|
|
|
|
_ -- [b.co]m
|
|
|
|
a
|
|
|
|
|
|
|
|
It's possible for a character zero cell to have descendants.
|
|
|
|
|
|
|
|
It's not possible for a character zero cell to have next siblings.
|
|
|
|
|
|
|
|
This will have to be taken into account during both vacuuming and matching.
|
|
|
|
|
|
|
|
Character zero cells with no descendant are discarded during vacuuming.
|
|
|
|
Character zero cells with a descendant, or character zero cells which are a
|
|
|
|
descendant are kept in the vacuumed trie.
|
|
|
|
|
|
|
|
A vacuumed trie is very efficient memory- and lookup-wise, but is also
|
|
|
|
read-only: no string can be added or removed. The read-only trie is really
|
|
|
|
just a self-sufficient array of integers, and can easily be exported/imported
|
|
|
|
as a JSON array. It is theoretically possible to "decompile" a trie (vacuumed
|
|
|
|
or not) into the set of strings originally added to it (in the order they
|
|
|
|
were added with the current implementation), but so far I do not need this
|
|
|
|
feature.
|
|
|
|
|
2017-11-03 13:36:16 +01:00
|
|
|
TODO: It's possible to build the vacuumed trie on the fly as items are
|
|
|
|
added to it. I need to carefully list all possible cases which can arise
|
|
|
|
at insertion time. The benefits will be: faster creation time (expected), no
|
|
|
|
longer read-only trie (items can be added at any time).
|
|
|
|
|
2017-11-02 20:49:11 +01:00
|
|
|
*/
|
|
|
|
|
|
|
|
// Compacts the un-vacuumed trie into its read-only form and returns it as
// an instance of HNTrie8, HNTrie16 or HNTrie32, whichever is the smallest
// container able to hold every value. Returns null when nothing was added.
// The builder is reset as a side effect, whenever a trie is returned.
//
// Input cells are 3 entries (descendant index, sibling index, character
// code); output cells are 4+n entries (descendant index, sibling index,
// first character code, count n of merged characters, then the n merged
// character codes).
HNTrieBuilder.prototype.vacuum = function() {
    if ( this.bufsz === 0 ) { return null; }
    var input = this.buf,
        output = [], outsz = 0,
        forks = [],                         // stack of (iout, iin) pairs
        maxcc = 0,                          // largest character code stored
        iin = 0, iout, c;
    for (;;) {
        iout = outsz;
        c = input[iin+2];                   // first character
        if ( c > maxcc ) { maxcc = c; }
        output[iout+0] = 0;
        output[iout+1] = 0;
        output[iout+2] = c;
        output[iout+3] = 0;
        outsz += 4;
        if ( input[iin] !== 0 ) {           // cell with descendant
            forks.push(iout, iin);          // defer processing
        }
        for (;;) {                          // merge sibling cell(s)
            iin = input[iin+1];             // sibling cell
            if ( iin === 0 ) { break; }     // no more sibling cell
            if ( input[iin] !== 0 ) { break; }  // cell with a descendant
            c = input[iin+2];
            if ( c === 0 ) { break; }       // don't merge \x00
            if ( c > maxcc ) { maxcc = c; }
            output[outsz] = c;              // add character data
            outsz += 1;
        }
        output[iout+3] = outsz - iout - 4;  // count of merged characters
        if ( iin !== 0 && input[iin] !== 0 ) {  // can't merge this cell
            output[iout+1] = outsz;         // it becomes the next sibling
            continue;
        }
        if ( forks.length === 0 ) { break; }    // no more descendants: bye
        iin = forks.pop();                  // process next descendant
        iout = forks.pop();
        iin = input[iin];
        output[iout] = outsz;
    }
    // Pick optimal read-only container array. Jump indices and counts are
    // all below `outsz`; character codes must fit as well, otherwise the
    // typed array would silently truncate them (charCodeAt() never yields
    // more than 65535, hence only the 8-bit case needs the extra test).
    var trie;
    if ( outsz < 256 && maxcc < 256 ) {
        trie = new this.HNTrie8(output, outsz);
    } else if ( outsz < 65536 ) {
        trie = new this.HNTrie16(output, outsz);
    } else {
        trie = new this.HNTrie32(output, outsz);
    }
    this.reset();                           // free working array
    return trie;
};
|
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
|
|
|
The following internal classes are the actual output of the vacuum() method.
|
|
|
|
|
|
|
|
They use the minimal amount of data to be able to efficiently lookup strings
|
|
|
|
in a read-only trie.
|
|
|
|
|
|
|
|
Given that javascript optimizers mind that the type of an argument passed to
|
|
|
|
a function always stays the same each time the function is called, there need
|
|
|
|
to be three separate implementations of matches() to allow the javascript
|
|
|
|
optimizer to do its job.
|
|
|
|
|
|
|
|
The matching code deals only with looking up values in a TypedArray (beside
|
|
|
|
calls to String.charCodeAt), so I expect this to be fast and good candidate
|
|
|
|
for optimization by javascript engines.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Read-only trie whose cells live in a Uint8Array: the first `bufsz`
// entries of `buf` are copied, so every one of them is expected to be
// below 256.
HNTrieBuilder.prototype.HNTrie8 = function(buf, bufsz) {
    this.buf = new Uint8Array(buf.slice(0, bufsz));
};

// Returns true when `needle` is one of the stored hostnames or a subdomain
// of one of them, false otherwise. The needle is read from its last
// character to its first one.
HNTrieBuilder.prototype.HNTrie8.prototype.matches = function(needle) {
    var buf = this.buf,
        ichar = needle.length,
        i = 0, c1, c2, ccnt, ic;
    for (;;) {
        ichar -= 1;
        c1 = ichar === -1 ? 0 : needle.charCodeAt(ichar);
        while ( (c2 = buf[i+2]) !== c1 ) {      // quick test: first character
            // A stored hostname ends here and the needle goes on with a
            // dot: the needle is a subdomain.
            if ( c2 === 0 && c1 === 0x2E ) { return true; }
            i = buf[i];                         // next descendant
            if ( i === 0 ) { return false; }    // no more descendants
        }
        if ( c1 === 0 ) { return true; }        // exact match
        ccnt = buf[i+3];
        if ( ccnt !== 0 ) {                     // cell holds merged characters
            if ( ccnt > ichar ) { return false; }   // needle is too short
            for ( ic = 0; ic < ccnt; ic++ ) {
                if ( needle.charCodeAt(ichar-1-ic) !== buf[i+4+ic] ) {
                    return false;
                }
            }
            ichar -= ccnt;
        }
        i = buf[i+1];                           // next sibling
        if ( i === 0 ) {
            // End of a stored hostname: the needle must either end here
            // too, or go on with a dot.
            return ichar === 0 || needle.charCodeAt(ichar-1) === 0x2E;
        }
    }
};
|
|
|
|
|
|
|
|
// Read-only trie whose cells live in a Uint16Array: the first `bufsz`
// entries of `buf` are copied, so every one of them is expected to be
// below 65536.
HNTrieBuilder.prototype.HNTrie16 = function(buf, bufsz) {
    this.buf = new Uint16Array(buf.slice(0, bufsz));
};

// Returns true when `needle` is one of the stored hostnames or a subdomain
// of one of them, false otherwise. The needle is read from its last
// character to its first one.
HNTrieBuilder.prototype.HNTrie16.prototype.matches = function(needle) {
    var buf = this.buf,
        ichar = needle.length,
        i = 0, c1, c2, ccnt, ic;
    for (;;) {
        ichar -= 1;
        c1 = ichar === -1 ? 0 : needle.charCodeAt(ichar);
        while ( (c2 = buf[i+2]) !== c1 ) {      // quick test: first character
            // A stored hostname ends here and the needle goes on with a
            // dot: the needle is a subdomain.
            if ( c2 === 0 && c1 === 0x2E ) { return true; }
            i = buf[i];                         // next descendant
            if ( i === 0 ) { return false; }    // no more descendants
        }
        if ( c1 === 0 ) { return true; }        // exact match
        ccnt = buf[i+3];
        if ( ccnt !== 0 ) {                     // cell holds merged characters
            if ( ccnt > ichar ) { return false; }   // needle is too short
            for ( ic = 0; ic < ccnt; ic++ ) {
                if ( needle.charCodeAt(ichar-1-ic) !== buf[i+4+ic] ) {
                    return false;
                }
            }
            ichar -= ccnt;
        }
        i = buf[i+1];                           // next sibling
        if ( i === 0 ) {
            // End of a stored hostname: the needle must either end here
            // too, or go on with a dot.
            return ichar === 0 || needle.charCodeAt(ichar-1) === 0x2E;
        }
    }
};
|
|
|
|
|
|
|
|
// Read-only trie whose cells live in a Uint32Array: the first `bufsz`
// entries of `buf` are copied.
HNTrieBuilder.prototype.HNTrie32 = function(buf, bufsz) {
    this.buf = new Uint32Array(buf.slice(0, bufsz));
};

// Returns true when `needle` is one of the stored hostnames or a subdomain
// of one of them, false otherwise. The needle is read from its last
// character to its first one.
HNTrieBuilder.prototype.HNTrie32.prototype.matches = function(needle) {
    var buf = this.buf,
        ichar = needle.length,
        i = 0, c1, c2, ccnt, ic;
    for (;;) {
        ichar -= 1;
        c1 = ichar === -1 ? 0 : needle.charCodeAt(ichar);
        while ( (c2 = buf[i+2]) !== c1 ) {      // quick test: first character
            // A stored hostname ends here and the needle goes on with a
            // dot: the needle is a subdomain.
            if ( c2 === 0 && c1 === 0x2E ) { return true; }
            i = buf[i];                         // next descendant
            if ( i === 0 ) { return false; }    // no more descendants
        }
        if ( c1 === 0 ) { return true; }        // exact match
        ccnt = buf[i+3];
        if ( ccnt !== 0 ) {                     // cell holds merged characters
            if ( ccnt > ichar ) { return false; }   // needle is too short
            for ( ic = 0; ic < ccnt; ic++ ) {
                if ( needle.charCodeAt(ichar-1-ic) !== buf[i+4+ic] ) {
                    return false;
                }
            }
            ichar -= ccnt;
        }
        i = buf[i+1];                           // next sibling
        if ( i === 0 ) {
            // End of a stored hostname: the needle must either end here
            // too, or go on with a dot.
            return ichar === 0 || needle.charCodeAt(ichar-1) === 0x2E;
        }
    }
};
|
2017-11-05 18:33:46 +01:00
|
|
|
|
|
|
|
/*******************************************************************************
|
|
|
|
|
|
|
|
Experimenting: WebAssembly version.
|
|
|
|
Developed using this simple online tool: https://wasdk.github.io/WasmFiddle/
|
|
|
|
|
|
|
|
>>> start of C code
|
|
|
|
unsigned short buffer[0];
|
|
|
|
int matches(int id, int cclen)
|
|
|
|
{
|
|
|
|
unsigned short* cc0 = &buffer[0];
|
|
|
|
unsigned short* cc = cc0 + cclen;
|
|
|
|
unsigned short* cell0 = &buffer[512+id];
|
|
|
|
unsigned short* cell = cell0;
|
|
|
|
unsigned short* ww;
|
|
|
|
int c1, c2, ccnt;
|
|
|
|
for (;;) {
|
|
|
|
c1 = cc <= cc0 ? 0 : *--cc;
|
|
|
|
for (;;) {
|
|
|
|
c2 = cell[2];
|
|
|
|
if ( c2 == c1 ) { break; }
|
|
|
|
if ( c2 == 0 && c1 == 0x2E ) { return 1; }
|
|
|
|
if ( cell[0] == 0 ) { return 0; }
|
|
|
|
cell = cell0 + cell[0];
|
|
|
|
}
|
|
|
|
if ( c1 == 0 ) { return 1; }
|
|
|
|
ccnt = cell[3];
|
|
|
|
if ( ccnt != 0 ) {
|
|
|
|
if ( cc - ccnt < cc0 ) { return 0; }
|
|
|
|
ww = cell + 4;
|
|
|
|
while ( ccnt-- ) {
|
|
|
|
if ( *--cc != *ww++ ) { return 0; }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if ( cell[1] == 0 ) {
|
|
|
|
if ( cc == cc0 ) { return 1; }
|
|
|
|
if ( *--cc == 0x2E ) { return 1; }
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
cell = cell0 + cell[1];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
int getLinearMemoryOffset() {
|
|
|
|
return (int)&buffer[0];
|
|
|
|
}
|
|
|
|
<<< end of C code
|
|
|
|
|
|
|
|
Observations:
|
|
|
|
- When growing memory, we must re-create the typed array js-side. The content
|
|
|
|
of the array is preserved by grow().
|
|
|
|
- It's slower than the javascript version... Possible explanations:
|
|
|
|
- Call overhead: https://github.com/WebAssembly/design/issues/1120
|
|
|
|
- Having to copy whole input string in buffer before call.
|
|
|
|
|
|
|
|
var HNTrie16wasm = (function() {
|
|
|
|
var module;
|
|
|
|
var instance;
|
|
|
|
var memory;
|
|
|
|
var memoryOrigin = 0;
|
|
|
|
var memoryUsed = 1024;
|
|
|
|
var cbuffer;
|
|
|
|
var tbuffer;
|
|
|
|
var tbufferSize = 0;
|
|
|
|
var matchesFn;
|
|
|
|
|
|
|
|
var init = function() {
|
|
|
|
module = new WebAssembly.Module(new Uint8Array([0,97,115,109,1,0,0,0,1,139,128,128,128,0,2,96,2,127,127,1,127,96,0,1,127,3,131,128,128,128,0,2,0,1,4,132,128,128,128,0,1,112,0,0,5,131,128,128,128,0,1,0,1,6,129,128,128,128,0,0,7,172,128,128,128,0,3,6,109,101,109,111,114,121,2,0,7,109,97,116,99,104,101,115,0,0,21,103,101,116,76,105,110,101,97,114,77,101,109,111,114,121,79,102,102,115,101,116,0,1,10,217,130,128,128,0,2,202,130,128,128,0,1,5,127,32,1,65,1,116,65,12,106,33,3,32,0,65,1,116,65,140,8,106,34,2,33,0,2,64,2,64,2,64,2,64,2,64,2,64,3,64,65,0,33,5,2,64,32,3,65,12,77,13,0,32,3,65,126,106,34,3,47,1,0,33,5,11,2,64,32,5,32,0,47,1,4,34,1,70,13,0,2,64,32,5,65,46,71,13,0,3,64,32,1,65,255,255,3,113,69,13,5,32,0,47,1,0,34,1,69,13,6,32,2,32,1,65,1,116,106,34,0,47,1,4,34,1,65,46,71,13,0,12,2,11,11,3,64,32,0,47,1,0,34,1,69,13,3,32,5,32,2,32,1,65,1,116,106,34,0,47,1,4,71,13,0,11,11,65,1,33,6,32,5,69,13,5,2,64,2,64,32,0,47,1,6,34,1,69,13,0,32,3,32,1,65,1,116,107,65,12,73,13,8,32,1,65,127,115,33,5,32,0,65,8,106,33,1,3,64,32,5,65,1,106,34,5,69,13,1,32,1,47,1,0,33,4,32,1,65,2,106,33,1,32,4,32,3,65,126,106,34,3,47,1,0,70,13,0,12,2,11,11,32,0,47,1,2,34,1,69,13,5,32,2,32,1,65,1,116,106,33,0,12,1,11,11,65,0,15,11,65,0,15,11,65,1,15,11,65,0,15,11,32,3,65,12,70,13,0,32,3,65,126,106,47,1,0,65,46,70,33,6,11,32,6,15,11,65,0,11,132,128,128,128,0,0,65,12,11]));
|
|
|
|
instance = new WebAssembly.Instance(module);
|
|
|
|
memory = instance.exports.memory;
|
|
|
|
memoryOrigin = instance.exports.getLinearMemoryOffset();
|
|
|
|
cbuffer = new Uint16Array(memory.buffer, memoryOrigin, 512);
|
|
|
|
tbuffer = new Uint16Array(memory.buffer, memoryOrigin + 1024);
|
|
|
|
memoryUsed = memoryOrigin + 1024;
|
|
|
|
matchesFn = instance.exports.matches;
|
|
|
|
};
|
|
|
|
|
|
|
|
return {
|
|
|
|
create: function(data) {
|
|
|
|
if ( module === undefined ) { init(); }
|
|
|
|
var bytesNeeded = memoryUsed + ((data.length * 2 + 3) & ~3);
|
|
|
|
if ( bytesNeeded > memory.buffer.byteLength ) {
|
|
|
|
memory.grow((bytesNeeded - memory.buffer.byteLength + 65535) >>> 16);
|
|
|
|
cbuffer = new Uint16Array(memory.buffer, memoryOrigin, 512);
|
|
|
|
tbuffer = new Uint16Array(memory.buffer, memoryOrigin + 1024);
|
|
|
|
}
|
|
|
|
for ( var i = 0, j = tbufferSize; i < data.length; i++, j++ ) {
|
|
|
|
tbuffer[j] = data[i];
|
|
|
|
}
|
|
|
|
var id = tbufferSize;
|
|
|
|
tbufferSize += data.length;
|
|
|
|
if ( tbufferSize & 1 ) { tbufferSize += 1; }
|
|
|
|
memoryUsed += tbufferSize * 2;
|
|
|
|
return id;
|
|
|
|
},
|
|
|
|
reset: function() {
|
|
|
|
module = undefined;
|
|
|
|
instance = undefined;
|
|
|
|
memory = undefined;
|
|
|
|
memory.grow(1);
|
|
|
|
memoryUsed = 1024;
|
|
|
|
cbuffer = undefined;
|
|
|
|
tbuffer = undefined;
|
|
|
|
tbufferSize = 0;
|
|
|
|
},
|
|
|
|
matches: function(id, hn) {
|
|
|
|
var len = hn.length;
|
|
|
|
if ( len > 512 ) {
|
|
|
|
hn = hn.slice(-512);
|
|
|
|
var pos = hn.indexOf('.');
|
|
|
|
if ( pos !== 0 ) {
|
|
|
|
hn = hn.slice(pos + 1);
|
|
|
|
}
|
|
|
|
len = hn.length;
|
|
|
|
}
|
|
|
|
var needle = cbuffer, i = len;
|
|
|
|
while ( i-- ) {
|
|
|
|
needle[i] = hn.charCodeAt(i);
|
|
|
|
}
|
|
|
|
return matchesFn(id, len) === 1;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
})();
|
|
|
|
*/
|