reference, declarationdefinition
definition → references, declarations, derived classes, virtual overrides
reference to multiple definitions → definitions
unreferenced
    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
#!/usr/bin/env python
"""
Unicode case folding database conversion utility

Parses the database and generates a C++ function which implements the case
folding algorithm. The database entries are of the form:

  <code>; <status>; <mapping>; # <name>

<status> can be one of four characters:
  C - Common mappings
  S - mappings for Simple case folding
  F - mappings for Full case folding
  T - special case for Turkish I characters

Right now this generates a function which implements simple case folding (C+S
entries).
"""

from __future__ import print_function

import sys
import re
try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen


# This variable will body of the mappings function
body = ""

# Reads file line-by-line, extracts Common and Simple case fold mappings and
# returns a (from_char, to_char, from_name) tuple.
def mappings(f):
    previous_from = -1
    expr = re.compile(r'^(.*); [CS]; (.*); # (.*)')
    for line in f:
        m = expr.match(line)
        if not m: continue
        from_char = int(m.group(1), 16)
        to_char = int(m.group(2), 16)
        from_name = m.group(3)

        if from_char <= previous_from:
            raise Exception("Duplicate or unsorted characters in input")
        yield from_char, to_char, from_name
        previous_from = from_char

# Computes the shift (to_char - from_char) in a mapping.
def shift(mapping):
    return mapping[1] - mapping[0]

# Computes the stride (from_char2 - from_char1) of two mappings.
def stride2(mapping1, mapping2):
    return mapping2[0] - mapping1[0]

# Computes the stride of a list of mappings. The list should have at least two
# mappings. All mappings in the list are assumed to have the same stride.
def stride(block):
    return stride2(block[0], block[1])


# b is a list of mappings. All the mappings are assumed to have the same
# shift and the stride between adjecant mappings (if any) is constant.
def dump_block(b):
    global body

    if len(b) == 1:
        # Special case for handling blocks of length 1. We don't even need to
        # emit the "if (C < X) return C" check below as all characters in this
        # range will be caught by the "C < X" check emitted by the first
        # non-trivial block.
        body  += "  // {2}\n  if (C == {0:#06x})\n    return {1:#06x};\n".format(*b[0])
        return

    first = b[0][0]
    last = first + stride(b) * (len(b)-1)
    modulo = first % stride(b)

    # All characters before this block map to themselves.
    body += "  if (C < {0:#06x})\n    return C;\n".format(first)
    body += "  // {0} characters\n".format(len(b))

    # Generic pattern: check upper bound (lower bound is checked by the "if"
    # above) and modulo of C, return C+shift.
    pattern = "  if (C <= {0:#06x} && C % {1} == {2})\n    return C + {3};\n"

    if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0:
        # Special case:
        # We can elide the modulo-check because the expression "C|1" will map
        # the intervening characters to themselves.
        pattern = "  if (C <= {0:#06x})\n    return C | 1;\n"
    elif stride(b) == 1:
        # Another special case: X % 1 is always zero, so don't emit the
        # modulo-check.
        pattern = "  if (C <= {0:#06x})\n    return C + {3};\n"

    body += pattern.format(last, stride(b), modulo, shift(b[0]))

current_block = []
f = urlopen(sys.argv[1])
for m in mappings(f):
    if len(current_block) == 0:
        current_block.append(m)
        continue

    if shift(current_block[0]) != shift(m):
        # Incompatible shift, start a new block.
        dump_block(current_block)
        current_block = [m]
        continue

    if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m):
        current_block.append(m)
        continue

    # Incompatible stride, start a new block.
    dump_block(current_block)
    current_block = [m]
f.close()

dump_block(current_block)

print('//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//')
print('//')
print('// This file was generated by utils/unicode-case-fold.py from the Unicode')
print('// case folding database at')
print('//   ', sys.argv[1])
print('//')
print('// To regenerate this file, run:')
print('//   utils/unicode-case-fold.py \\')
print('//     "{}" \\'.format(sys.argv[1]))
print('//     > lib/Support/UnicodeCaseFold.cpp')
print('//')
print('//===----------------------------------------------------------------------===//')
print('')
print('#include "llvm/Support/Unicode.h"')
print('')
print("int llvm::sys::unicode::foldCharSimple(int C) {")
print(body)
print("  return C;")
print("}")