summary refs log tree commit diff stats
path: root/scripts/export-data.js
diff options
context:
space:
mode:
author Mathias Bynens <mathias@qiwi.be> 2014-05-24 13:08:37 +0200
committer Mathias Bynens <mathias@qiwi.be> 2014-05-24 14:12:39 +0200
commit 9f63ec2d00bcb3ccf79a4780b040652fce7176b7 (patch)
tree 3f3b0cec5fe79b64cf0b048887ed7ffdc151bb50 /scripts/export-data.js
parent 4c760cf495ef087aa3c6df1021ed5061cf15c4e7 (diff)
download he-9f63ec2d00bcb3ccf79a4780b040652fce7176b7.zip
he-9f63ec2d00bcb3ccf79a4780b040652fce7176b7.tar.gz
he-9f63ec2d00bcb3ccf79a4780b040652fce7176b7.tar.bz2
Don’t encode invalid code points whose character references would refer to another code point
I.e. the code points listed in the first column of the overrides table at <http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#table-charref-overrides>. Closes #19.
Diffstat (limited to 'scripts/export-data.js')
-rw-r--r-- scripts/export-data.js 28
1 files changed, 28 insertions, 0 deletions
diff --git a/scripts/export-data.js b/scripts/export-data.js
index dec1c2c..d7705cc 100644
--- a/scripts/export-data.js
+++ b/scripts/export-data.js
@@ -58,6 +58,12 @@ var invalidRawCodePoints = readJSON('invalid-raw-code-points');
// raw code points. http://whatwg.org/html/tokenization.html#data-state
invalidRawCodePoints.unshift(0x0000);
+var overrides = Object.keys(
+ JSON.parse(fs.readFileSync('data/decode-map-overrides.json', 'utf-8'))
+).map(function(codePoint) {
+ return Number(codePoint);
+});
+
module.exports = {
'encodeMap': readJSON('encode-map'),
'encodeASCII': encodeASCII, // not used
@@ -82,6 +88,28 @@ module.exports = {
')([=a-zA-Z0-9])?';
}()),
'regexLoneSurrogate': '[\\uD800-\\uDBFF](?:[^\\uDC00-\\uDFFF]|$)|(?:[^\\uD800-\uDBFF]|^)[\\uDC00-\\uDFFF]',
+ 'ascii': (function() {
+ return regenerate()
+ // Add all ASCII symbols (not just printable ASCII).
+ .addRange(0x0, 0x7F)
+ // Remove code points listed in the first column of the overrides table.
+ // http://whatwg.org/html/tokenization.html#table-charref-overrides
+ .remove(overrides)
+ .toString();
+ }()),
+ 'otherBMP': (function() {
+ return regenerate()
+ // Add all BMP symbols.
+ .addRange(0x0, 0xFFFF)
+ // Remove ASCII newlines.
+ .remove('\r', '\n')
+ // Remove printable ASCII symbols (U+0020–U+007E) as well as DEL (U+007F).
+ .removeRange(0x20, 0x7F)
+ // Remove code points listed in the first column of the overrides table.
+ // http://whatwg.org/html/tokenization.html#table-charref-overrides
+ .remove(overrides)
+ .toString();
+ }()),
'testData': fs.readFileSync('data/entities.json', 'utf-8').trim(),
'version': JSON.parse(fs.readFileSync('package.json', 'utf-8')).version
};