summaryrefslogtreecommitdiffstats
path: root/scintilla/scripts/GenerateCaseConvert.py
blob: 37506b7b953df1c4e2d7109c62e3e2b8eb513c5a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Script to generate CaseConvert.cxx from Python's Unicode data
# Should be run rarely when a Python with a new version of Unicode data is available.
# Requires Python 3.3 or later
# Should not be run with old versions of Python.

# Current best approach divides case conversions into two cases: 
# simple symmetric and complex.
# Simple symmetric is where a lower and upper case pair convert to each
# other and the folded form is the same as the lower case. 
# There are 1006 symmetric pairs.
# These are further divided into ranges (stored as lower, upper, range length,
# range pitch and singletons (stored as lower, upper). 
# Complex is for cases that don't fit the above: where there are multiple
# characters in one of the forms or fold is different to lower or 
# lower(upper(x)) or upper(lower(x)) are not x. These are represented as UTF-8
# strings with original, folded, upper, and lower separated by '|'.
# There are 126 complex cases.

import codecs, itertools, os, string, sys, unicodedata

from FileGenerator import Regenerate

def contiguousRanges(l, diff):
    # l is s list of lists
    # group into lists where first element of each element differs by diff
    out = [[l[0]]]
    for s in l[1:]:
        if s[0] != out[-1][-1][0] + diff:
            out.append([])
        out[-1].append(s)
    return out

def flatten(listOfLists):
    "Flatten one level of nesting"
    return itertools.chain.from_iterable(listOfLists)
    
def conversionSets():
    # For all Unicode characters, see whether they have case conversions
    # Return 2 sets: one of simple symmetric conversion cases and another
    # with complex cases.
    complexes = []
    symmetrics = []
    for ch in range(sys.maxunicode):
        if ch >= 0xd800 and ch <= 0xDBFF:
            continue
        if ch >= 0xdc00 and ch <= 0xDFFF:
            continue
        uch = chr(ch)

        fold = uch.casefold()
        upper = uch.upper()
        lower = uch.lower()
        symmetric = False
        if uch != upper and len(upper) == 1 and uch == lower and uch == fold:
            lowerUpper = upper.lower()
            foldUpper = upper.casefold()
            if lowerUpper == foldUpper and lowerUpper == uch:
                symmetric = True
                symmetrics.append((ch, ord(upper), ch - ord(upper)))
        if uch != lower and len(lower) == 1 and uch == upper and lower == fold:
            upperLower = lower.upper()
            if upperLower == uch:
                symmetric = True

        if fold == uch:
            fold = ""
        if upper == uch:
            upper = ""
        if lower == uch:
            lower = ""

        if (fold or upper or lower) and not symmetric:
            complexes.append((uch, fold, upper, lower))

    return symmetrics, complexes

def groupRanges(symmetrics):
    # Group the symmetrics into groups where possible, returning a list
    # of ranges and a list of symmetrics that didn't fit into a range

    def distance(s):
        return s[2]

    groups = []
    uniquekeys = []
    for k, g in itertools.groupby(symmetrics, distance):
        groups.append(list(g))      # Store group iterator as a list
        uniquekeys.append(k)

    contiguousGroups = flatten([contiguousRanges(g, 1) for g in groups])
    longGroups = [(x[0][0], x[0][1], len(x), 1) for x in contiguousGroups if len(x) > 4]
    
    oneDiffs = [s for s in symmetrics if s[2] == 1]
    contiguousOnes = flatten([contiguousRanges(g, 2) for g in [oneDiffs]])
    longOneGroups = [(x[0][0], x[0][1], len(x), 2) for x in contiguousOnes if len(x) > 4]

    rangeGroups = sorted(longGroups+longOneGroups, key=lambda s: s[0])

    rangeCoverage = list(flatten([range(r[0], r[0]+r[2]*r[3], r[3]) for r in rangeGroups]))
    
    nonRanges = [(l, u) for l, u, d in symmetrics if l not in rangeCoverage]

    return rangeGroups, nonRanges

def escape(s):
	return "".join((chr(c) if chr(c) in string.ascii_letters else "\\x%x" % c) for c in s.encode('utf-8'))

def updateCaseConvert():
    symmetrics, complexes = conversionSets()
    
    rangeGroups, nonRanges = groupRanges(symmetrics)

    print(len(rangeGroups), "ranges")
    rangeLines = ["%d,%d,%d,%d, " % x for x in rangeGroups]

    print(len(nonRanges), "non ranges")
    nonRangeLines = ["%d,%d, " % x for x in nonRanges]
    
    print(len(symmetrics), "symmetric")
    
    complexLines = ['"%s|%s|%s|%s|"' % tuple(escape(t) for t in x) for x in complexes]
    print(len(complexLines), "complex")

    Regenerate("../src/CaseConvert.cxx", "//", rangeLines, nonRangeLines, complexLines)

updateCaseConvert()