summaryrefslogtreecommitdiffstats
path: root/scripts/process-data.js
blob: 8189178dd673095ea513de4aa32b5f87338dda67 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
var fs = require('fs');
var jsesc = require('jsesc');
var _ = require('lodash');

// http://www.whatwg.org/specs/web-apps/current-work/entities.json
var data = require('../data/entities.json');

// Accumulators filled while walking the entity data:
// - `encodeMap`: character string → preferred reference name (no `&`, no `;`)
// - `encodeMultipleSymbols`: character strings made of more than one code point
// - `encodeSingleCodePoints`: code points of single-code-point entities
// - `decodeMap`: reference name (no `&`, no `;`) → character string
// - `decodeMapLegacy`: reference name lacking a trailing `;` → character string
var encodeMap = {};
var encodeMultipleSymbols = [];
var encodeSingleCodePoints = [];
var decodeMap = {};
var decodeMapLegacy = {};

_.forOwn(data, function(entity, reference) {
	// Count the uppercase ASCII letters in a reference name.
	var countUppercase = function(name) {
		return (name.match(/[A-Z]/g) || []).length;
	};

	var characters = entity.characters;
	var nameWithoutAmpersand = reference.replace(/^&/, '');

	// References without a trailing semicolon only go into the legacy decode
	// map; they take no part in encoding.
	if (!/;$/.test(nameWithoutAmpersand)) {
		decodeMapLegacy[nameWithoutAmpersand] = characters;
		return;
	}

	var bareName = nameWithoutAmpersand.replace(/;$/, '');
	var current = encodeMap[characters];

	// Prefer short named character references with as few uppercase letters as
	// possible: replace the stored name when there is none yet, when this one
	// is shorter, or when it is equally long but has fewer uppercase letters.
	var isPreferred = !current ||
		current.length > bareName.length ||
		(
			current.length === bareName.length &&
			countUppercase(bareName) < countUppercase(current)
		);
	if (isPreferred) {
		encodeMap[characters] = bareName;
	}

	// Record what the encoder has to match: a lone code point, or a
	// multi-code-point string.
	var codePoints = entity.codepoints;
	if (codePoints.length === 1) {
		encodeSingleCodePoints.push(codePoints[0]);
	} else {
		encodeMultipleSymbols.push(characters);
	}

	decodeMap[bareName] = characters;
});

// Drop duplicate entries from both encode lists. Each list is sorted first and
// `true` is passed to `_.uniq` as its second argument to flag the input as
// already sorted.

// `Array#sort` without a comparator orders strings by UTF-16 code unit value
// and sorts in place.
encodeMultipleSymbols.sort();
encodeMultipleSymbols = _.uniq(encodeMultipleSymbols, true);

// `_.sortBy` without an iteratee is used here to get an ascending numeric sort.
var sortedSingleCodePoints = _.sortBy(encodeSingleCodePoints);
encodeSingleCodePoints = _.uniq(sortedSingleCodePoints, true);

// Names of the legacy references (the keys of `decodeMapLegacy`, i.e. the ones
// without a trailing semicolon), ordered by length in descending order. Names
// of equal length are ordered alphabetically (ascending code unit order), so
// the result does not depend on the key order of the input data.
var legacyReferences = _.keys(decodeMapLegacy).sort(function(a, b) {
	// Longer references come first.
	if (a.length > b.length) {
		return -1;
	}
	if (a.length < b.length) {
		return 1;
	}
	// a.length == b.length, so sort alphabetically. The operands are strings,
	// so subtracting them would always produce `NaN` (treated as "equal" by
	// `Array#sort`); compare them with relational operators instead.
	if (a < b) {
		return -1;
	}
	if (a > b) {
		return 1;
	}
	return 0;
});

/**
 * Serialize `object` with jsesc (non-compact, JSON-compatible output) and
 * write it synchronously to `fileName`, followed by a newline.
 *
 * @param {string} fileName - Path of the file to write.
 * @param {*} object - Value to serialize.
 */
var writeJSON = function(fileName, object) {
	var options = {
		'compact': false,
		'json': true
	};
	var serialized = jsesc(object, options);
	fs.writeFileSync(fileName, serialized + '\n');
};

// Emit every generated data file, in this order. Each entry pairs an output
// path with the value to serialize into it.
var outputFiles = [
	['data/decode-map.json', decodeMap],
	['data/decode-map-legacy.json', decodeMapLegacy],
	['data/decode-legacy-named-references.json', legacyReferences],
	['data/encode-map.json', encodeMap],
	['data/encode-paired-symbols.json', encodeMultipleSymbols],
	['data/encode-lone-code-points.json', encodeSingleCodePoints]
];
outputFiles.forEach(function(entry) {
	writeJSON(entry[0], entry[1]);
});