diff options
author | Mathias Bynens <mathias@qiwi.be> | 2014-05-24 13:08:37 +0200 |
---|---|---|
committer | Mathias Bynens <mathias@qiwi.be> | 2014-05-24 14:12:39 +0200 |
commit | 9f63ec2d00bcb3ccf79a4780b040652fce7176b7 (patch) | |
tree | 3f3b0cec5fe79b64cf0b048887ed7ffdc151bb50 /scripts/export-data.js | |
parent | 4c760cf495ef087aa3c6df1021ed5061cf15c4e7 (diff) | |
download | he-9f63ec2d00bcb3ccf79a4780b040652fce7176b7.zip he-9f63ec2d00bcb3ccf79a4780b040652fce7176b7.tar.gz he-9f63ec2d00bcb3ccf79a4780b040652fce7176b7.tar.bz2 |
Don’t encode invalid code points whose character references would refer to another code point
I.e. the code points listed in the first column of the overrides table at <http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#table-charref-overrides>.
Closes #19.
Diffstat (limited to 'scripts/export-data.js')
-rw-r--r-- | scripts/export-data.js | 28 |
1 file changed, 28 insertions, 0 deletions
```diff
diff --git a/scripts/export-data.js b/scripts/export-data.js
index dec1c2c..d7705cc 100644
--- a/scripts/export-data.js
+++ b/scripts/export-data.js
@@ -58,6 +58,12 @@ var invalidRawCodePoints = readJSON('invalid-raw-code-points');
 // raw code points. http://whatwg.org/html/tokenization.html#data-state
 invalidRawCodePoints.unshift(0x0000);
 
+var overrides = Object.keys(
+	JSON.parse(fs.readFileSync('data/decode-map-overrides.json', 'utf-8'))
+).map(function(codePoint) {
+	return Number(codePoint);
+});
+
 module.exports = {
 	'encodeMap': readJSON('encode-map'),
 	'encodeASCII': encodeASCII, // not used
@@ -82,6 +88,28 @@ module.exports = {
 			')([=a-zA-Z0-9])?';
 	}()),
 	'regexLoneSurrogate': '[\\uD800-\\uDBFF](?:[^\\uDC00-\\uDFFF]|$)|(?:[^\\uD800-\uDBFF]|^)[\\uDC00-\\uDFFF]',
+	'ascii': (function() {
+		return regenerate()
+			// Add all ASCII symbols (not just printable ASCII).
+			.addRange(0x0, 0x7F)
+			// Remove code points listed in the first column of the overrides table.
+			// http://whatwg.org/html/tokenization.html#table-charref-overrides
+			.remove(overrides)
+			.toString();
+	}()),
+	'otherBMP': (function() {
+		return regenerate()
+			// Add all BMP symbols.
+			.addRange(0x0, 0xFFFF)
+			// Remove ASCII newlines.
+			.remove('\r', '\n')
+			// Remove printable ASCII symbols.
+			.removeRange(0x20, 0x7F)
+			// Remove code points listed in the first column of the overrides table.
+			// http://whatwg.org/html/tokenization.html#table-charref-overrides
+			.remove(overrides)
+			.toString();
+	}()),
 	'testData': fs.readFileSync('data/entities.json', 'utf-8').trim(),
 	'version': JSON.parse(fs.readFileSync('package.json', 'utf-8')).version
 };
```