diff options
Diffstat (limited to 'scintilla/scripts/GenerateCaseConvert.py')
-rw-r--r-- | scintilla/scripts/GenerateCaseConvert.py | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/scintilla/scripts/GenerateCaseConvert.py b/scintilla/scripts/GenerateCaseConvert.py new file mode 100644 index 0000000..37506b7 --- /dev/null +++ b/scintilla/scripts/GenerateCaseConvert.py @@ -0,0 +1,126 @@ +# Script to generate CaseConvert.cxx from Python's Unicode data +# Should be run rarely when a Python with a new version of Unicode data is available. +# Requires Python 3.3 or later +# Should not be run with old versions of Python. + +# Current best approach divides case conversions into two cases: +# simple symmetric and complex. +# Simple symmetric is where a lower and upper case pair convert to each +# other and the folded form is the same as the lower case. +# There are 1006 symmetric pairs. +# These are further divided into ranges (stored as lower, upper, range length, +# range pitch and singletons (stored as lower, upper). +# Complex is for cases that don't fit the above: where there are multiple +# characters in one of the forms or fold is different to lower or +# lower(upper(x)) or upper(lower(x)) are not x. These are represented as UTF-8 +# strings with original, folded, upper, and lower separated by '|'. +# There are 126 complex cases. + +import codecs, itertools, os, string, sys, unicodedata + +from FileGenerator import Regenerate + +def contiguousRanges(l, diff): + # l is s list of lists + # group into lists where first element of each element differs by diff + out = [[l[0]]] + for s in l[1:]: + if s[0] != out[-1][-1][0] + diff: + out.append([]) + out[-1].append(s) + return out + +def flatten(listOfLists): + "Flatten one level of nesting" + return itertools.chain.from_iterable(listOfLists) + +def conversionSets(): + # For all Unicode characters, see whether they have case conversions + # Return 2 sets: one of simple symmetric conversion cases and another + # with complex cases. + complexes = [] + symmetrics = [] + for ch in range(sys.maxunicode): + if ch >= 0xd800 and ch <= 0xDBFF: + continue + if ch >= 0xdc00 and ch <= 0xDFFF: + continue + uch = chr(ch) + + fold = uch.casefold() + upper = uch.upper() + lower = uch.lower() + symmetric = False + if uch != upper and len(upper) == 1 and uch == lower and uch == fold: + lowerUpper = upper.lower() + foldUpper = upper.casefold() + if lowerUpper == foldUpper and lowerUpper == uch: + symmetric = True + symmetrics.append((ch, ord(upper), ch - ord(upper))) + if uch != lower and len(lower) == 1 and uch == upper and lower == fold: + upperLower = lower.upper() + if upperLower == uch: + symmetric = True + + if fold == uch: + fold = "" + if upper == uch: + upper = "" + if lower == uch: + lower = "" + + if (fold or upper or lower) and not symmetric: + complexes.append((uch, fold, upper, lower)) + + return symmetrics, complexes + +def groupRanges(symmetrics): + # Group the symmetrics into groups where possible, returning a list + # of ranges and a list of symmetrics that didn't fit into a range + + def distance(s): + return s[2] + + groups = [] + uniquekeys = [] + for k, g in itertools.groupby(symmetrics, distance): + groups.append(list(g)) # Store group iterator as a list + uniquekeys.append(k) + + contiguousGroups = flatten([contiguousRanges(g, 1) for g in groups]) + longGroups = [(x[0][0], x[0][1], len(x), 1) for x in contiguousGroups if len(x) > 4] + + oneDiffs = [s for s in symmetrics if s[2] == 1] + contiguousOnes = flatten([contiguousRanges(g, 2) for g in [oneDiffs]]) + longOneGroups = [(x[0][0], x[0][1], len(x), 2) for x in contiguousOnes if len(x) > 4] + + rangeGroups = sorted(longGroups+longOneGroups, key=lambda s: s[0]) + + rangeCoverage = list(flatten([range(r[0], r[0]+r[2]*r[3], r[3]) for r in rangeGroups])) + + nonRanges = [(l, u) for l, u, d in symmetrics if l not in rangeCoverage] + + return rangeGroups, nonRanges + +def escape(s): + return "".join((chr(c) if chr(c) in string.ascii_letters else "\\x%x" % c) for c in s.encode('utf-8')) + +def updateCaseConvert(): + symmetrics, complexes = conversionSets() + + rangeGroups, nonRanges = groupRanges(symmetrics) + + print(len(rangeGroups), "ranges") + rangeLines = ["%d,%d,%d,%d, " % x for x in rangeGroups] + + print(len(nonRanges), "non ranges") + nonRangeLines = ["%d,%d, " % x for x in nonRanges] + + print(len(symmetrics), "symmetric") + + complexLines = ['"%s|%s|%s|%s|"' % tuple(escape(t) for t in x) for x in complexes] + print(len(complexLines), "complex") + + Regenerate("../src/CaseConvert.cxx", "//", rangeLines, nonRangeLines, complexLines) + +updateCaseConvert() |