summaryrefslogtreecommitdiffstats
path: root/scripts/process-data.js
blob: db106ae96d0fe77455e1b108055dc32e8e914c4b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
'use strict';

const fs = require('fs');
const jsesc = require('jsesc');
const _ = require('lodash');
const sortObject = require('sort-object');

// https://html.spec.whatwg.org/entities.json
const data = require('../data/entities.json');

// Lookup tables filled in by the loop below.
const encodeMap = {}; // symbol(s) → preferred reference name (no `&`, no `;`)
let encodeMultipleSymbols = []; // symbols whose code point list is not a single entry
let encodeSingleCodePoints = []; // the code point of each single-code-point symbol
const decodeMap = {}; // reference name without trailing `;` → symbol(s)
const decodeMapLegacy = {}; // semicolon-less reference name → symbol(s)

// Counts the ASCII uppercase letters (`A`–`Z`) in `text`.
const countUppercaseLetters = (text) => (text.match(/[A-Z]/g) || []).length;

_.forOwn(data, (entity, reference) => {
	// Strip the leading `&` from the key; a trailing `;` (if any) is kept for now.
	const name = reference.replace(/^&/, '');
	const symbols = entity.characters;

	// References without a trailing semicolon only feed the legacy decode table.
	if (!/;$/.test(name)) {
		decodeMapLegacy[name] = symbols;
		return;
	}

	const bareName = name.replace(/;$/, '');
	const current = encodeMap[symbols];

	// A reference replaces the stored one when nothing is stored yet, when it is
	// shorter, or when it has the same length but fewer uppercase letters.
	const isPreferred =
		!current ||
		current.length > bareName.length ||
		(
			current.length === bareName.length &&
			countUppercaseLetters(bareName) < countUppercaseLetters(current)
		);
	if (isPreferred) {
		encodeMap[symbols] = bareName;
	}

	// Duplicates are expected here; they are removed after the loop.
	const codePoints = entity.codepoints;
	if (codePoints.length === 1) {
		encodeSingleCodePoints.push(codePoints[0]);
	} else {
		encodeMultipleSymbols.push(symbols);
	}

	decodeMap[bareName] = symbols;
});

// `Array#sort` without a comparator orders strings by their UTF-16 code units
// and sorts in place. The second argument to `_.uniq` is lodash's `isSorted`
// hint (part of the lodash 3 signature — confirm against the installed version).
encodeMultipleSymbols.sort();
encodeMultipleSymbols = _.uniq(encodeMultipleSymbols, true);

// Sort the code points numerically (ascending) before removing duplicates.
const sortedCodePoints = _.sortBy(encodeSingleCodePoints);
encodeSingleCodePoints = _.uniq(sortedCodePoints, true);

// Comparator: longer strings first; strings of equal length in ascending
// string-comparison order.
const compareLongestFirst = (left, right) => {
	if (left.length !== right.length) {
		return left.length > right.length ? -1 : 1;
	}
	if (left < right) {
		return -1;
	}
	return left > right ? 1 : 0;
};

// The legacy reference names, ordered longest-first to optimize the regular
// expression that will be generated from this list.
const legacyReferences = _.keys(decodeMapLegacy);
legacyReferences.sort(compareLongestFirst);

/**
 * Serializes `object` with jsesc (non-compact, JSON-compatible output) and
 * synchronously writes the result, followed by a newline, to `fileName`.
 *
 * @param {string} fileName Path of the file to write.
 * @param {*} object Value to serialize.
 */
const writeJSON = (fileName, object) => {
	const options = { compact: false, json: true };
	const serialized = jsesc(object, options);
	fs.writeFileSync(fileName, serialized + '\n');
};

// Output files, in the order they are written. Each entry pairs a path with a
// function that produces its payload, so every payload is built immediately
// before its own file is written.
const outputs = [
	['data/decode-map.json', () => sortObject(decodeMap)],
	['data/decode-map-legacy.json', () => sortObject(decodeMapLegacy)],
	['data/decode-legacy-named-references.json', () => legacyReferences],
	['data/encode-map.json', () => sortObject(encodeMap)],
	['data/encode-paired-symbols.json', () => encodeMultipleSymbols],
	['data/encode-lone-code-points.json', () => encodeSingleCodePoints]
];

for (const [fileName, buildPayload] of outputs) {
	writeJSON(fileName, buildPayload());
}