1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
var fs = require('fs');
var jsesc = require('jsesc');
var _ = require('lodash');
// http://www.whatwg.org/specs/web-apps/current-work/entities.json
var data = require('../data/entities.json');
// Lookup tables filled in by the pass over `data` below.
//  - encodeMap: characters -> preferred reference name (no `&`, no `;`)
//  - encodeMultipleSymbols: character strings made of more than one code point
//  - encodeSingleCodePoints: code points of single-code-point entities
//  - decodeMap: reference name (no `&`, no `;`) -> characters
//  - decodeMapLegacy: semicolon-less reference name (no `&`) -> characters
var encodeMap = {};
var encodeMultipleSymbols = [];
var encodeSingleCodePoints = [];
var decodeMap = {};
var decodeMapLegacy = {};

// Returns how many ASCII uppercase letters `text` contains.
var countUppercaseLetters = function(text) {
  var matches = text.match(/[A-Z]/g);
  return matches ? matches.length : 0;
};

_.forOwn(data, function(entity, reference) {
  var name = reference.replace(/^&/, '');
  var characters = entity.characters;

  // References without a trailing semicolon only go into the legacy decode map.
  if (!/;$/.test(name)) {
    decodeMapLegacy[name] = characters;
    return;
  }

  var bareName = name.replace(/;$/, '');
  var current = encodeMap[characters];

  // Keep the shortest reference for each string; on equal length, keep the one
  // with fewer uppercase letters. The first reference seen wins full ties.
  var isPreferred =
    !current ||
    current.length > bareName.length ||
    (
      current.length === bareName.length &&
      countUppercaseLetters(bareName) < countUppercaseLetters(current)
    );
  if (isPreferred) {
    encodeMap[characters] = bareName;
  }

  // Record what has to be encoded: a lone code point or a multi-code-point string.
  var codePoints = entity.codepoints;
  if (codePoints.length === 1) {
    encodeSingleCodePoints.push(codePoints[0]);
  } else {
    encodeMultipleSymbols.push(characters);
  }

  decodeMap[bareName] = characters;
});
// De-duplicate both encode lists after sorting them. The `true` argument is
// presumably lodash's `isSorted` flag (older `_.uniq` signature); confirm
// against the lodash version this script is pinned to.

// Default `Array#sort` orders the strings in place by UTF-16 code unit values.
var sortedSymbols = encodeMultipleSymbols.sort();
encodeMultipleSymbols = _.uniq(sortedSymbols, true);

// `_.sortBy` without an iteratee sorts the code points numerically.
var sortedCodePoints = _.sortBy(encodeSingleCodePoints);
encodeSingleCodePoints = _.uniq(sortedCodePoints, true);
// Names of the legacy (semicolon-less) references, ordered by length in
// descending order; names of equal length are ordered alphabetically
// (by UTF-16 code unit comparison).
// NOTE(review): longest-first presumably lets consumers match longer names
// before their prefixes; confirm against the code that reads this list.
var legacyReferences = _.keys(decodeMapLegacy).sort(function(a, b) {
  if (a.length !== b.length) {
    // Longer names first.
    return b.length - a.length;
  }
  // Same length: compare as strings. Subtracting strings (`a - b`) yields NaN,
  // which the sort treats as "equal", so it must not be used here.
  if (a < b) {
    return -1;
  }
  if (a > b) {
    return 1;
  }
  return 0;
});
/**
 * Serializes `object` with jsesc (JSON mode, non-compact output) and
 * synchronously writes the result, followed by a newline, to `fileName`.
 *
 * @param {string} fileName Path of the file to write.
 * @param {*} object Value to serialize.
 */
var writeJSON = function(fileName, object) {
  var serializationOptions = {
    'compact': false,
    'json': true
  };
  var serialized = jsesc(object, serializationOptions);
  fs.writeFileSync(fileName, serialized + '\n');
};
// Output files paired with the data written to each, in write order.
var outputFiles = [
  ['data/decode-map.json', decodeMap],
  ['data/decode-map-legacy.json', decodeMapLegacy],
  ['data/decode-legacy-named-references.json', legacyReferences],
  ['data/encode-map.json', encodeMap],
  ['data/encode-paired-symbols.json', encodeMultipleSymbols],
  ['data/encode-lone-code-points.json', encodeSingleCodePoints]
];
outputFiles.forEach(function(output) {
  writeJSON(output[0], output[1]);
});
|