

#' @importFrom stringr str_detect str_extract str_replace_all str_squish

# Tabulate, for every cuneiform sign in `x`, how often the dictionary lists
# that sign under each grammar type.
#
# `x` is converted with as.cuneiform() and split into tokens: either a single
# code point in U+12000..U+1254F, or a whole bracket sequence running from
# U+27E8 up to and including the next U+27E9 (or to the end of the string if
# the bracket is never closed). All other characters are ignored. Each token
# is named with as.sign_name() and looked up in the "trans." rows of `dic`.
#
# Arguments:
#   x        A length-1 value accepted by as.cuneiform().
#   dic      Data frame with columns `row_type`, `sign_name`, `type` and,
#            optionally, `count`. Only rows with `row_type == "trans."` are
#            used. An NA `count` (or a missing `count` column) counts as 1.
#   mapping  Mapping table handed to as.cuneiform() and as.sign_name(). When
#            NULL it is read from extdata/etcsl_mapping.txt of package "sumer".
#
# Returns:
#   A data frame with columns `position` (token index), `sign_name`,
#   `cuneiform` (the token), `type` and `n` (summed count). It has one row per
#   token per grammar type, where the set of types is the sorted union of all
#   non-empty types found for any token. Zero rows are returned when there are
#   no tokens or no types at all.
#
# Errors:
#   Stops if `x` is not of length 1, or if the default mapping file cannot be
#   located.
sign_grammar <- function(x, dic, mapping = NULL) {

  # Validate before touching the file system so bad input fails fast.
  if (length(x) != 1L) stop("x must be a single character string.")

  if (is.null(mapping)) {
    path <- system.file("extdata", "etcsl_mapping.txt", package = "sumer")
    # system.file() returns "" when the file is not found.
    if (!nzchar(path)) {
      stop("Mapping file 'etcsl_mapping.txt' not found in package 'sumer'.")
    }
    mapping <- utils::read.csv2(path, sep = ";", na.strings = "")
  }

  empty_result <- data.frame(position   = integer(0),
                             sign_name  = character(0),
                             cuneiform  = character(0),
                             type       = character(0),
                             n          = integer(0),
                             stringsAsFactors = FALSE)

  # --- Convert to cuneiform, then tokenize into individual signs ---
  cune_str <- as.character(as.cuneiform(x, mapping = mapping))
  chars    <- strsplit(cune_str, "")[[1]]
  n_ch     <- length(chars)
  cps      <- vapply(chars, utf8ToInt, integer(1), USE.NAMES = FALSE)

  open_cp  <- 0x27E8L
  close_cp <- 0x27E9L
  is_sign  <- !is.na(cps) & cps >= 0x12000L & cps <= 0x1254FL
  close_at <- which(cps == close_cp)

  # At most one token per character, so this is a safe upper bound.
  tokens <- character(n_ch)
  n_tok  <- 0L
  i <- 1L
  while (i <= n_ch) {
    if (isTRUE(cps[i] == open_cp)) {
      # Bracket token: everything up to the next closing bracket, or to the
      # end of the string when the bracket is left open.
      later <- close_at[close_at > i]
      last  <- if (length(later) > 0L) later[1L] else n_ch
      n_tok <- n_tok + 1L
      tokens[n_tok] <- paste(chars[i:last], collapse = "")
      i <- last + 1L
    } else {
      if (is_sign[i]) {
        n_tok <- n_tok + 1L
        tokens[n_tok] <- chars[i]
      }
      i <- i + 1L
    }
  }
  tokens <- tokens[seq_len(n_tok)]

  n <- length(tokens)
  if (n == 0L) return(empty_result)

  # --- Get sign name for each token ---
  sign_names <- vapply(tokens, function(tok) {
    as.character(as.sign_name(tok, mapping = mapping))
  }, character(1), USE.NAMES = FALSE)

  # --- Select the dictionary rows that matter ---
  # which() drops NA comparisons, so rows with a missing row_type do not turn
  # into all-NA phantom rows.
  trans <- dic[which(dic$row_type == "trans."), , drop = FALSE]
  if (nrow(trans) == 0L || is.null(trans$sign_name) || is.null(trans$type)) {
    return(empty_result)
  }
  dic_sign <- as.character(trans$sign_name)
  dic_type <- trimws(as.character(trans$type))

  # Rows with a usable type that belong to one of the signs in the input.
  # NA sign names never match anything, on either side.
  rows <- which(!is.na(dic_sign) & !is.na(dic_type) & dic_type != "" &
                  dic_sign %in% sign_names)

  all_types <- sort(unique(dic_type[rows]))
  K <- length(all_types)
  if (K == 0L) return(empty_result)  # No types found for any sign

  # Per-row weight: the dictionary count, with NA (or no column) meaning 1.
  raw_count <- trans$count
  if (is.null(raw_count)) {
    cnt <- rep(1L, length(rows))
  } else {
    raw_count <- raw_count[rows]
    cnt <- as.integer(raw_count)
    cnt[is.na(raw_count)] <- 1L
  }

  # --- Sum counts into a (unique sign) x (type) matrix ---
  u_signs <- unique(sign_names)
  n_sign  <- length(u_signs)
  counts  <- matrix(0L, nrow = n_sign, ncol = K)
  # Column-major linear index of each row's (sign, type) cell.
  cell <- (match(dic_type[rows], all_types) - 1L) * n_sign +
    match(dic_sign[rows], u_signs)
  totals <- vapply(split(cnt, cell), sum, integer(1))
  counts[as.integer(names(totals))] <- totals

  # --- Build result: one row per sign per grammar type ---
  # t() puts types in rows, so as.vector() walks all types of token 1, then
  # all types of token 2, ... matching the rep(each = K) columns.
  tok_row <- match(sign_names, u_signs)
  data.frame(
    position   = rep(seq_len(n), each = K),
    sign_name  = rep(sign_names, each = K),
    cuneiform  = rep(tokens, each = K),
    type       = rep(all_types, times = n),
    n          = as.vector(t(counts[tok_row, , drop = FALSE])),
    stringsAsFactors = FALSE
  )
}
