#' CCF: Copula Control Function
#'
#' @description `CCF()` computes copula control functions (CCFs) that can be
#' added in the outcome model as control variables to correct for endogeneity.
#' which returns \eqn{P^*}, \eqn{W^*}, and the first-stage residuals.
#'
#' @details The `formula` argument is either in the 1-bar form `Y ~ X | P` or the 2-bar form `Y ~ X | P | W`, where
#' `X` respresents the explanatory variable(s) in the `Y` model, `P` represents the continuous
#' endogenous regressors, and `W` represents the exogenous regressors.  If `X` contains no
#' exogenous regressors, then the 2sCOPE model reduces to the simpler model in Park and Gupta (2012)
#' and returns \eqn{P^*} (the copula transformation of `P`) as CCF and \eqn{W^*} (the copula transformation of `W`) as null.
#' When the structural outcome model includes an intercept, copula transformations of regressors in `P` and `W`  use the
#' optimized algorithm (Equation 9 in Qian, Koschmann, and Xie, 2025)  to avoid estimation bias.
#'
#' The function CCF() will compute copula control function for each endogenous regressor specified in `P`.
#' Only first-order terms of endogenous regressors need to be included in `P`, even when the structural outcome model
#' contains higher-order terms of endogenous regressors. This is because including copula control functions for the
#' first-order endogenous regressors is sufficient to control for endogeneity, while adding control functions for
#' higher-order endogenous terms---such as interactions among endogenous regressors, interactions between endogenous and
#' exogenous regressors, or squared endogenous regressors---is unnecessary and can substantially degrade the performance
#' of copula correction (Qian, Koschmann, and Xie, 2025). This parsimonious treatment of higher-order endogenous
#' regressors is a merit of copula correction.
#'
#' Thus, if `X` contains no higher-order terms of endogenous regressors, the simpler 1-bar form `Y ~ X | P`
#' can be used, and `CCF()` treats all regressors in `X` except those in `P` as exogenous.
#' When `X` includes higher-order endogenous terms, the 2-bar form  `Y ~ X | P | W` should be used to explicitly specify
#'  the exogenous regressors in `W` and ensure that the higher-order endogenous terms are not treated as exogenous variables.
#'
#' @param formula A formula describing the model to be fitted. The details of
#'   model specification are given under ``Details''.
#' @param data a data frame, list, or environment containing the variables in
#'   the model.
#'
#' @return A list of class "`ccf`" containing the following components:
#'   \item{ccf}{a matrix of the first-stage residuals as copula control functions.} 
#'   \item{pstar}{a matrix representing \eqn{P^*}} 
#'   \item{wstar}{a matrix representing \eqn{W^*}}
#'
#' @examples
#' data("diapers") #load data
#'
#' ### Specify logPrice as endogenous using the 1-bar option
#' #run the copula control function
#' ccf_1bar <- CCF(logVol ~ logPrice+Fshare+week+Q2+Q3+Q4|logPrice,data=diapers)
#' #print the first 5 elements of the first-stage residuals
#' head(ccf_1bar$ccf, 5)
#' head(ccf_1bar$pstar, 5) #print the first 5 elements of P*
#' head(ccf_1bar$wstar, 5) #print the first 5 elements of W*
#'
#' ### Specify logPrice as endogenous and the rest of the variables as exogenous
#' #using the 2-bar option, which will produce the same results,
#' ccf_2bar <- CCF(logVol ~ logPrice+Fshare+week+Q2+Q3+Q4|logPrice|
#'     Fshare+week+Q2+Q3+Q4, data = diapers) #run the copula control function
#' head(ccf_2bar$ccf, 5) #print first 5 elements of the 1st-stage resid
#' head(ccf_2bar$pstar, 5) #print first 5 elements of P*
#' head(ccf_2bar$wstar, 5) #print first 5 elements of W*
#'
#' ### Run Park & Gupta (2012) by specifying logPrice as the only regressor,
#' ### which is endogenous.
#'
#' #run the copula control function
#' ccf_pg <- CCF(logVol ~ logPrice|logPrice, data = diapers)
#' head(ccf_pg$ccf, 5) #print first 5 elements of the 1st-stage resid
#' head(ccf_pg$pstar, 5) #print first 5 elements of P*
#' head(ccf_pg$wstar, 5) #print first 5 elements of W*
#' # notice how the 1st-stage residuals and P* are equivalent, and wstar is NULL
#'
#' @references
#' Qian, Y., Koschmann, A., & Xie, H. (2025).
#' \emph{EXPRESS: A Practical Guide to Endogeneity Correction Using Copulas.}
#' Journal of Marketing. <doi:10.1177/00222429251410844>\cr
#'
#' Park, S., & Gupta, S. (2012).
#' \emph{Handling endogenous regressors by joint estimation using copulas.}
#' Marketing Science, 31(4), 567-586.\cr
#'
#' Yang, F., Qian, Y., & Xie, H. (2025).
#' \emph{Addressing Endogeneity Using a Two-Stage Copula Generated Regressor Approach.}
#' Journal of Marketing Research, 62(4), 601-623.
#' <doi:10.1177/00222437241296453>\cr
#' @export

CCF <- function(formula, data) {
  F.formula <- Formula::as.Formula(formula)
  if(sum(length(F.formula)) == 2)
    stop("Endogenous and exogenous regressors not specified in formula.")
  x <- as.matrix(model.matrix(F.formula, data = data, rhs = 1))
  n <- nrow(x)
  if (ncol(x) <= 1)  stop("No predictor variables specified.")

  p <- as.matrix(model.matrix(F.formula, data = data, rhs = 2))
  p <- p[,colnames(p)!= "(Intercept)", drop=F]
  if(length(setdiff(colnames(p),colnames(x)))>0) {
    if(length(setdiff(colnames(p),colnames(x))) == 1)
      stop("Endogenous regressor ", setdiff(colnames(p),colnames(x)),
           " was not included in the original regression formula.")
    else
      stop("Endogenous regressors ",
           paste(setdiff(colnames(p), colnames(x)), collapse = ", "),
           " were not included in the original regression formula.")
  }
  np <- ncol(p)

  if(sum(length(F.formula))==3) {
    w <- x[, setdiff(colnames(x), colnames(p)), drop = FALSE]
    if(ncol(w) == 1)
       w <- NULL
  } else if(sum(length(F.formula))==4) {
    w <- as.matrix(model.matrix(F.formula, data = data, rhs = 3))
    if(ncol(w) >= ncol(x))
      stop("Not all exogenous regressors specified in original formula.")
  } else {
    w <- NULL
  }



  if(!is.null(w) & length(intersect(colnames(p), colnames(w))) > 0 & .internal_env$p_and_w_remove_w){
    if(length(intersect(colnames(p), colnames(w))) == 1) {
      .internal_env$run_fstat <- T
      stop("Regressor ", paste(intersect(colnames(p), colnames(w)), collapse = ", "), " was specified as both endogenous and exogenous.")
    } else {
      .internal_env$run_fstat <- T
      stop("Regressors ", paste(intersect(colnames(p), colnames(w)), collapse = ", "), " were specified as both endogenous and exogenous.")
    }
  }

  is_higher_order <- grepl(":", colnames(p)) |
    grepl("\\^", colnames(p)) |
    grepl("I\\(", colnames(p)) |
    grepl("poly\\(", colnames(p))


  higher_order_labels <- colnames(p)[is_higher_order]
  ho_formula  <- if (length(higher_order_labels)) reformulate(higher_order_labels) else ~0
  fpw <- model.matrix(ho_formula, data = data)
  fpw <- fpw[, colnames(fpw) != "(Intercept)", drop = FALSE]
  subset_vec <- colnames(w)[-1][
    sapply(colnames(w)[-1], function(v) any(grepl(v, colnames(fpw), fixed = TRUE)))
  ]

  w <- w[ , -which(colnames(w) %in% c("(Intercept)",colnames(p))), drop=F]
  nw <- ncol(w)
  wnames<-colnames(w)
  pnames<-colnames(p)

  if(length(fpw) > 0 & .internal_env$high_order_terms_warning == T) {
    if(sum(length(F.formula))==3) {
      stop("Please use the 2-bar formula option to specify higher-order terms as endogenous.")
      .internal_env$run_fstat <- T
    }
  warning("The list of endogenous regressors should only include first-order terms of endogenous regressors.
  Adding control functions for higher order terms of endogenous regressors is unnecessary and can\n  worsen model performance. Removing ",
    colnames(fpw), " from the list of endogenous regressors is advised.")
  }
  if(!is.null(w)) {
    if((ncol(x)-ncol(p)-1) > ncol(w)) {
      .internal_env$run_fstat <- T
      stop("Not all regressors in original formula specified as either endogenous or exogenous.")
    }
    if(nw == 0)
      w <- NULL
  }

  fstats <- NULL
  pstar <- matrix(0,n,np)
  for (i in 1:np){
    pstartemp = ecdf(p[,i])(p[,i])
    pstartemp[pstartemp==1]=n/(n+1)
    pstartemp= qnorm(pstartemp)
    pstar[,i] = pstartemp
  }
  if ((is.null(w))) {
    if((ncol(x)-1) > ncol(p)) {
      .internal_env$run_fstat <- T
      stop("Exogenous regressors not specified in formula.")
    }
    if(.internal_env$p_and_g_output) {
      message("Note: No exogenous regressors specified, using copula_origin (Park & Gupta). \nNote that the stage 1 residuals are equiavalent to Pstar and that\nWstar is NULL because there are no exogenous regressors.")
    }
    stage1_resid <- pstar
    wstar <- NULL
  }else{
    wstar <- matrix(0,n,nw)
    for (i in 1:nw){
      wstartemp = ecdf(w[,i])(w[,i])
      wstartemp[wstartemp==1]=n/(n+1)
      wstartemp = qnorm(wstartemp)
      wstar[,i] = wstartemp
    }
    colnames(wstar) <- wnames

    stage1_resid = matrix(0,n,np)
    if(.internal_env$run_fstat)
      fstats <- data.frame()
    for (j in 1:np){
      dataj<-data.frame(pj=pstar[,j], wstar)
      regj<-lm(pj~., data=dataj)
      stage1_resid[,j]<- regj$resid
      if(.internal_env$run_fstat) {
        stage1F<- car::Anova(regj, type=3)
        temp_df <- as.data.frame(stage1F[stage1F[,3]>10 & !is.na(stage1F[,3]),])
        significant_with <- sapply(rownames(temp_df), function(term) {
          if (grepl("^I\\..*\\.$", term)) {
            core <- sub("^I\\.", "", term)
            core <- sub("\\.$", "", core)
            core <- gsub("\\.\\.\\.", " * ", core)
            return(paste0("I(", core, ")"))
          } else {
            return(term)
          }
        })
        rownames(temp_df) <- NULL
        endogenous_var <- pnames[j]
        if(nrow(temp_df) > 0) {
          fstats <- dplyr::filter(rbind(fstats, cbind(endogenous_var, significant_with, temp_df)),
                          significant_with != "(Intercept)")
          rownames(fstats) <- NULL
        } else {
          fstats <- NULL
        }
      }
    }
  }
  colnames(stage1_resid) <- paste0("ccf: ",colnames(p))
  colnames(pstar) <- colnames(p)
  on.exit({.internal_env$run_fstat <- T})
  z <- list(ccf= stage1_resid, pstar = pstar, wstar = wstar)
  attr(z, "f_stat") <- fstats
  attr(z, "x") <- x
  attr(z, "p") <- p
  attr(z, "np") <- np
  attr(z, "pstar") <- pstar
  class(z) <- "ccf"
  z
}
