import regex import unicodedata if False: # This code is equivalent to: cpt.to_bytes(4, "little")) def cpt_to_utf8_str(cpt): if cpt <= 0xFF: return bytes([cpt, 0, 0, 0]) elif cpt <= 0xFFFF: return bytes([cpt & 0xFF, cpt >> 8, 0, 0]) elif cpt <= 0xFFFFFF: return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0]) else: return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24]) # This code is equivalent to: regex_expr_compiled.match(chr(codepoint)) def is_match(codepoint, regex_expr): try: res = regex_expr.match(cpt_to_utf8_str(codepoint).decode('utf-32')) return res is not None except Exception: return False # Verify previous statements, using chr() and ord() for codepoint in range(0x110000): temp = cpt_to_utf8_str(codepoint) assert(temp == codepoint.to_bytes(4, "little")) try: char = temp.decode('utf-32') if codepoint == 0xFEFF: # BOM assert(char == "") # why? char = "\uFEFF" except UnicodeDecodeError: continue assert(char == chr(codepoint) ) assert(ord(char) == codepoint ) def get_matches(regex_expr): regex_expr_compiled = regex.compile(regex_expr) unicode_ranges = [] current_range = None for codepoint in range(0x110000): char = chr(codepoint) if regex_expr_compiled.match(char): if current_range is None: current_range = [codepoint, codepoint] else: current_range[1] = codepoint elif current_range is not None: unicode_ranges.append(tuple(current_range)) current_range = None if current_range is not None: unicode_ranges.append(tuple(current_range)) return unicode_ranges def print_cat(mode, cat, ranges): if mode == "range": print("const std::vector> unicode_ranges_{} = {{".format(cat)) if mode == "range_value": print("const std::vector> unicode_ranges_{} = {{".format(cat)) if mode == "map": print("const std::map unicode_map_{} = {{".format(cat)) for i, values in enumerate(ranges): end = ",\n" if (i%4 == 3 or i+1 == len(ranges)) else ", " values = ["0x%08X"%value for value in values] print("{" + ", ".join(values) + "}", end=end) print("};") print("") print_cat("range", "number", get_matches(r'\p{N}')) print_cat("range", "letter", get_matches(r'\p{L}')) print_cat("range", "separator", get_matches(r'\p{Z}')) print_cat("range", "accent_mark", get_matches(r'\p{M}')) print_cat("range", "punctuation", get_matches(r'\p{P}')) print_cat("range", "symbol", get_matches(r'\p{S}')) print_cat("range", "control", get_matches(r'\p{C}')) print_cat("range", "whitespace", get_matches(r'\s')) map_lowercase = [] map_uppercase = [] for codepoint in range(0x110000): char = chr(codepoint) lower = ord(char.lower()[0]) upper = ord(char.upper()[0]) if codepoint != lower: map_lowercase.append((codepoint,lower)) if codepoint != upper: map_uppercase.append((codepoint,upper)) print_cat("map", "lowercase", map_lowercase) print_cat("map", "uppercase", map_uppercase) # TODO: this is wrong # inv_map_nfd = {} # for codepoint in range(0x110000): # char = chr(codepoint) # norm = ord(unicodedata.normalize('NFD', char)[0]) # if codepoint != norm: # a, b = inv_map_nfd.get(norm, (codepoint, codepoint)) # inv_map_nfd[norm] = (min(a, codepoint), max(b, codepoint)) # nfd_ranges = [ (a, b, nfd) for nfd,(a,b) in inv_map_nfd.items() ] # nfd_ranges = list(sorted(nfd_ranges)) # del inv_map_nfd # print_cat("range_value", "nfd", nfd_ranges)