#!/usr/bin/env python3

# Copyright 2016 Patrick O. Perry.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import re

# Location of the Unicode Character Database file that lists the
# Sentence_Break property assignments.  The path is tried relative to the
# current directory first and then relative to its parent (see below).
BREAK_PROPERTY = "data/ucd/auxiliary/SentenceBreakProperty.txt"

# Matches one data line of the form "XXXX ; Name" or "XXXX..YYYY ; Name".
# Group 1 is the first code point, group 3 the optional last code point,
# and group 4 the property name.
pattern = re.compile(r"""^([0-9A-Fa-f]+)        # (first code)
                          (\.\.([0-9A-Fa-f]+))? # (.. last code)?
                          \s*
                          ;                     # ;
                          \s*
                          (\w+)                 # (property name)
                          \s*
                          (\#.*)?$              # (# comment)?""", re.X)

# Largest valid Unicode code point.
UNICODE_MAX = 0x10FFFF

# Parse SentenceBreakProperty.txt

# The encoding is given explicitly so that parsing does not depend on the
# locale's default encoding (UCD data files are distributed as UTF-8).
try:
    prop_file = open(BREAK_PROPERTY, "r", encoding="utf-8")
except FileNotFoundError:
    prop_file = open("../" + BREAK_PROPERTY, "r", encoding="utf-8")

# Property name for every code point; anything the file does not mention
# keeps the default 'Other'.
code_props = ['Other'] * (UNICODE_MAX + 1)
# Names of all non-default properties seen in the file.
prop_names = set()
# Largest code point explicitly assigned by the file.
code_max = 0

# NOTE(review): not referenced anywhere in the visible part of this script.
properties = set()
with prop_file:
    for line in prop_file:
        line = line.split("#")[0]  # remove comment
        m = pattern.match(line)
        if not m:
            continue
        first = int(m.group(1), 16)
        last = int(m.group(3), 16) if m.group(3) else first
        name = m.group(4)
        for u in range(first, last + 1):
            code_props[u] = name
        # 'Other' is the implicit default and always has value 0.  Keeping
        # it out of prop_names prevents it from being renumbered below and
        # from being emitted twice in the generated enum.
        if name != 'Other':
            prop_names.add(name)
        code_max = max(code_max, last)

# Numeric value for each property: 'Other' is 0, and the remaining names
# are numbered 1, 2, ... in sorted order.
prop_vals = {'Other': 0}

for p in sorted(prop_names):
    prop_vals[p] = len(prop_vals)


def compute_tables(block_size):
    """Build the two-stage lookup tables for a given block size.

    The module-level ``code_props`` list is cut into consecutive chunks of
    ``block_size`` entries; only the ``(UNICODE_MAX + 1) // block_size``
    whole chunks are considered.  Identical chunks are stored once.

    Returns a ``(stage1, stage2)`` tuple where ``stage2`` is the list of
    distinct chunks (each a tuple of property names) in order of first
    appearance, and ``stage1[i]`` is the index into ``stage2`` of the chunk
    covering code points ``i * block_size`` up to, but not including,
    ``(i + 1) * block_size``.
    """
    block_count = (UNICODE_MAX + 1) // block_size
    index_of = {}        # chunk -> its position in unique_blocks
    unique_blocks = []
    block_ids = []
    for start in range(0, block_count * block_size, block_size):
        chunk = tuple(code_props[start:start + block_size])
        block_id = index_of.get(chunk)
        if block_id is None:
            # First time this chunk is seen: give it the next free slot.
            block_id = len(unique_blocks)
            index_of[chunk] = block_id
            unique_blocks.append(chunk)
        block_ids.append(block_id)
    return (block_ids, unique_blocks)


def stage1_item_size(nstage2):
    """Return the byte width needed for a stage-1 entry.

    A stage-1 entry is an index into the stage-2 table, so it must be able
    to hold every value from 0 to ``nstage2 - 1``.  The result is the
    smallest power-of-two number of bytes (1, 2, 4, 8, ...) that can do so.

    Integer arithmetic is used instead of floating-point logarithms, so the
    degenerate case of a single stage-2 block (or none) yields 1 rather
    than raising ``ValueError`` from ``log2(0)``.
    """
    # Largest index that must be representable; clamp so that zero or one
    # block still requires one byte.
    max_index = max(nstage2 - 1, 1)
    nbyte = (max_index.bit_length() + 7) // 8

    # Round the byte count up to the next power of two.
    size = 1
    while size < nbyte:
        size *= 2
    return size

# Table sizes are rounded up to a multiple of this many bytes before
# candidate block sizes are compared.
page_size = 4096

# Estimated total table size in bytes, keyed by candidate block size.
nbytes = {}

best_block_size = None
smallest_size = None
best_tables = None

# Try every power-of-two block size from 2 to 65536 and keep the one with
# the smallest estimated footprint.  The first candidate always becomes the
# initial best, so the result is never an untested block size; on ties the
# smaller block size wins because only a strictly smaller size replaces it.
for shift in range(1, 17):
    candidate = 2**shift
    candidate_tables = compute_tables(candidate)
    candidate_stage1, candidate_stage2 = candidate_tables

    nbyte1 = len(candidate_stage1) * stage1_item_size(len(candidate_stage2))
    # Each stage-2 entry is counted as one byte (see type2 below).
    nbyte2 = len(candidate_stage2) * candidate

    nbyte1 = math.ceil(nbyte1 / page_size) * page_size
    nbyte2 = math.ceil(nbyte2 / page_size) * page_size
    nbyte = nbyte1 + nbyte2
    nbytes[candidate] = nbyte

    if smallest_size is None or nbyte < smallest_size:
        smallest_size = nbyte
        best_block_size = candidate
        best_tables = candidate_tables


# Reuse the tables already built for the winning block size.
block_size = best_block_size
stage1, stage2 = best_tables

type1_size = stage1_item_size(len(stage2))

# C integer type for stage-1 entries; any width other than 1, 2 or 4 bytes
# falls back to the 64-bit type.
type1 = {
    1: 'uint8_t',
    2: 'uint16_t',
    4: 'uint32_t',
}.get(type1_size, 'uint64_t')

# C integer type for stage-2 entries (the numeric property values).
type2 = 'int8_t'



# Write sentbreakprop.h to stdout

# Fixed preamble: generated-file banner, descriptive comment, include guard
# and the <stdint.h> include.
header_lines = [
    "/* This file is automatically generated. DO NOT EDIT!",
    "   Instead, edit gen-sentbreak.py and re-run.  */",
    "",
    "/*",
    " * Unicode Sentence_Break property values.",
    " *",
    " * Defined in UAX #29 \"Unicode Text Segmentation\"",
    " *",
    " *     http://www.unicode.org/reports/tr29/",
    " *",
    " * Section 4.1, Table 3.",
    " *",
    " *",
    " * We use the two-stage lookup strategy described at",
    " *",
    " *     http://www.strchr.com/multi-stage_tables",
    " *",
    " */",
    "",
    "#ifndef SENTBREAKPROP_H",
    "#define SENTBREAKPROP_H",
    "",
    "#include <stdint.h>",
    "",
]
for text in header_lines:
    print(text)

# Enum of property values: OTHER first, then the remaining names in sorted
# order, separated by ",\n" with no trailing comma.
enum_members = ["\tSENT_BREAK_OTHER = 0"]
for prop in sorted(prop_names):
    enum_members.append(
        "\tSENT_BREAK_" + prop.upper() + " = " + str(prop_vals[prop]))
print("enum sent_break_prop {")
print(",\n".join(enum_members))
print("};")
print("")

# Stage-1 table: every entry except the last is followed by a comma, laid
# out 16 per line, each line prefixed with the first code point it covers.
print("static const " + type1 + " sent_break_stage1[] = {")
leading_entries = stage1[:-1]
for row_start in range(0, len(leading_entries), 16):
    row = leading_entries[row_start:row_start + 16]
    label = "/* U+{:04X} */".format(row_start * block_size)
    cells = "".join("{0: >3},".format(entry) for entry in row)
    # Only a complete row of 16 ends its line; a shorter final row shares
    # its line with the last entry printed below.
    print(label + cells, end="\n" if len(row) == 16 else "")
print("{0: >3}".format(stage1[-1]))
print("};")
print("")

# Stage-2 table: one brace-enclosed initializer per distinct block, with
# 16 numeric property values per line.
print("static const " + type2 + " sent_break_stage2[][" +
        str(block_size) + "] = {")
last_block_no = len(stage2) - 1
for block_no, block in enumerate(stage2):
    print("  /* block " + str(block_no) + " */")
    cells = ["{0: >3}".format(prop_vals[block[k]])
             for k in range(block_size)]
    rows = [",".join(cells[k:k + 16]) for k in range(0, block_size, 16)]
    print("  {" + ",\n   ".join(rows) + "\n  }", end="")
    # Blocks are separated by a comma and a blank line; the last block is
    # followed by a plain newline.
    if block_no != last_block_no:
        print(",\n")
    else:
        print("")
print("};")

# Lookup function and closing include guard.
footer_lines = [
    "",
    "static int sent_break(uint32_t code)",
    "{",
    "\tconst uint32_t block_size = " + str(block_size) + ";",
    "\t" + type1 + " i = sent_break_stage1[code / block_size];",
    "\treturn sent_break_stage2[i][code % block_size];",
    "}",
    "",
    "#endif /* SENTBREAKPROP_H */",
]
for text in footer_lines:
    print(text)
