1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
'use strict';
const fs = require('fs');
const jsesc = require('jsesc');
const _ = require('lodash');
const sortObject = require('sort-object');
// https://html.spec.whatwg.org/entities.json
const data = require('../data/entities.json');
const encodeMap = {};
let encodeMultipleSymbols = [];
let encodeSingleCodePoints = [];
const decodeMap = {};
const decodeMapLegacy = {};
_.forOwn(data, function(value, key) {
const referenceWithLeadingAmpersand = key;
const referenceWithoutLeadingAmpersand = referenceWithLeadingAmpersand.replace(/^&/, '');
const referenceOnly = referenceWithoutLeadingAmpersand.replace(/;$/, '');
const string = value.characters;
const codePoints = value.codepoints;
if (/;$/.test(referenceWithoutLeadingAmpersand)) {
// Only enter this branch if the entity has a trailing semicolon.
const tmp = encodeMap[string];
// Prefer short named character references with as few uppercase letters as
// possible.
if ( // Only add an entry if…
!tmp || ( // …there is no entry for this string yet, or…
tmp.length > referenceOnly.length || // …this reference is shorter, or…
(
// …this reference contains fewer uppercase letters.
tmp.length == referenceOnly.length &&
(referenceOnly.match(/[A-Z]/g) || []).length <
(tmp.match(/[A-Z]/g) || []).length
)
)
) {
encodeMap[string] = referenceOnly;
} else {
// Do nothing.
}
if (codePoints.length == 1) {
encodeSingleCodePoints.push(codePoints[0]);
} else {
encodeMultipleSymbols.push(string);
}
}
if (/;$/.test(referenceWithoutLeadingAmpersand)) {
decodeMap[referenceWithoutLeadingAmpersand.replace(/;$/, '')] = string;
} else {
decodeMapLegacy[referenceWithoutLeadingAmpersand] = string;
}
});
encodeMultipleSymbols = _.uniq(
encodeMultipleSymbols.sort(), // Sort strings by code point value.
true
);
encodeSingleCodePoints = _.uniq(
_.sortBy(encodeSingleCodePoints), // Sort numerically.
true
);
const legacyReferences = _.keys(decodeMapLegacy).sort(function(a, b) {
// Optimize the regular expression that will be generated based on this data
// by sorting the references by length in descending order.
if (a.length > b.length) {
return -1;
}
if (a.length < b.length) {
return 1;
}
// If the length of both strings is equal, sort alphabetically.
if (a < b) {
return -1;
}
if (a > b) {
return 1;
}
return 0;
});
const writeJSON = function(fileName, object) {
const json = jsesc(object, {
'compact': false,
'json': true
});
fs.writeFileSync(fileName, json + '\n');
};
writeJSON('data/decode-map.json', sortObject(decodeMap));
writeJSON('data/decode-map-legacy.json', sortObject(decodeMapLegacy));
writeJSON('data/decode-legacy-named-references.json', legacyReferences);
writeJSON('data/encode-map.json', sortObject(encodeMap));
writeJSON('data/encode-paired-symbols.json', encodeMultipleSymbols);
writeJSON('data/encode-lone-code-points.json', encodeSingleCodePoints);
|