% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/multifor.R
\encoding{UTF-8}
\name{multifor}
\alias{multifor}
\title{Construct a multi forest prediction rule and calculate multi-class and discriminatory variable importance scores as described in Hornung & Hapfelmeier (2024).}
\usage{
multifor(
  formula = NULL,
  data = NULL,
  num.trees = ifelse(nrow(data) <= 5000, 5000, 1000),
  importance = "both",
  write.forest = TRUE,
  probability = TRUE,
  min.node.size = NULL,
  max.depth = NULL,
  replace = FALSE,
  sample.fraction = ifelse(replace, 1, 0.7),
  case.weights = NULL,
  keep.inbag = FALSE,
  inbag = NULL,
  holdout = FALSE,
  oob.error = TRUE,
  num.threads = NULL,
  verbose = TRUE,
  seed = NULL,
  dependent.variable.name = NULL,
  mtry = NULL,
  npervar = 5
)
}
\arguments{
\item{formula}{Object of class \code{formula} or \code{character} describing the model to fit. Interaction terms supported only for numerical variables.}

\item{data}{Training data of class \code{data.frame}, \code{matrix}, or \code{dgCMatrix} (from the Matrix package).}

\item{num.trees}{Number of trees. Default is 5000 for datasets with a maximum of 5000 observations and 1000 for datasets with more than 5000 observations.}

\item{importance}{Variable importance mode, one of the following: "both" (the default), "multiclass", "discriminatory", "none". If "multiclass", multi-class VIM values are computed, if "discriminatory", discriminatory VIM values are computed, and if "both", both multi-class and discriminatory VIM values are computed. See the 'Details' section below for details.}

\item{write.forest}{Save \code{multifor.forest} object, required for prediction. Set to \code{FALSE} to reduce memory usage if no prediction intended.}

\item{probability}{Grow a probability forest as in Malley et al. (2012). Using this option (default is \code{TRUE}), class probability predictions are obtained.}

\item{min.node.size}{Minimal node size. Default 5 for probability and 1 for classification.}

\item{max.depth}{Maximal tree depth. A value of NULL or 0 (the default) corresponds to unlimited depth, 1 to tree stumps (1 split per tree).}

\item{replace}{Sample with replacement. Default is \code{FALSE}.}

\item{sample.fraction}{Fraction of observations to sample. Default is 1 for sampling with replacement and 0.7 for sampling without replacement. This can be a vector of class-specific values.}

\item{case.weights}{Weights for sampling of training observations. Observations with larger weights will be selected with higher probability in the bootstrap (or subsampled) samples for the trees.}

\item{keep.inbag}{Save how often observations are in-bag in each tree.}

\item{inbag}{Manually set observations per tree. List of size num.trees, containing inbag counts for each observation. Can be used for stratified sampling.}

\item{holdout}{Hold-out mode. Hold-out all samples with case weight 0 and use these for variable importance and prediction error.}

\item{oob.error}{Compute OOB prediction error. Default is \code{TRUE}.}

\item{num.threads}{Number of threads. Default is number of CPUs available.}

\item{verbose}{Show computation status and estimated runtime.}

\item{seed}{Random seed. Default is \code{NULL}, which generates the seed from \code{R}. Set to \code{0} to ignore the \code{R} seed.}

\item{dependent.variable.name}{Name of outcome variable, needed if no formula given.}

\item{mtry}{Number of candidate variables to sample for each split. Default is the (rounded down) square root of the number of variables.}

\item{npervar}{Number of splits to sample per candidate variable. Default is 5.}
}
\value{
Object of class \code{multifor} with elements
  \item{\code{predictions}}{Predicted classes (for \code{probability=FALSE}) or class probabilities (for \code{probability=TRUE}), based on out-of-bag samples.}
  \item{\code{num.trees}}{Number of trees.}
  \item{\code{num.independent.variables}}{Number of independent variables.}
  \item{\code{min.node.size}}{Value of minimal node size used.}
  \item{\code{mtry}}{Number of candidate variables sampled for each split.}
  \item{\code{var.imp.multiclass}}{Multi-class VIM values. Only computed for independent variables that feature at least as many unique values as the outcome variable has classes. For other variables, the entries in the vector \code{var.imp.multiclass} will be \code{NA}.}
  \item{\code{var.imp.discr}}{Discriminatory VIM values for all independent variables.}
  \item{\code{prediction.error}}{Overall out-of-bag prediction error. For classification this is the fraction of misclassified samples and for probability estimation the Brier score.}
  \item{\code{confusion.matrix}}{Contingency table for classes and predictions based on out-of-bag samples (classification only).}
  \item{\code{forest}}{Saved forest (if \code{write.forest} is set to \code{TRUE}). Note that the variable IDs in the \code{split.varIDs} object do not necessarily represent the column number in R.}
  \item{\code{treetype}}{Type of forest/tree. Classification or probability.}
  \item{\code{call}}{Function call.}
  \item{\code{importance.mode}}{Importance mode used.}
  \item{\code{num.samples}}{Number of samples.}
  \item{\code{replace}}{Sample with replacement.}
  \item{\code{plotres}}{List of objects needed by the plot functions: \code{data} contains the data; \code{yvarname} is the name of the outcome variable.}
}
\description{
Implements multi forests, a random forest variant tailored for multi-class 
outcomes (Hornung & Hapfelmeier, 2024). Multi forests feature the multi-class 
variable importance measure (VIM) and the discriminatory VIM.\cr 
The \emph{multi-class VIM} measures the degree to which the variables are 
specifically associated with one or more classes. In contrast, conventional VIMs, 
such as the permutation VIM or the Gini importance, measure the overall influence 
of variables regardless of their class-association. Therefore, these measures 
rank not only class-associated variables high, but also variables that only 
discriminate well between groups of classes. This is problematic if only 
class-associated variables are to be identified.\cr
Similar to conventional VIMs, the \emph{discriminatory VIM} measures the general 
influence of the variables.\cr\cr
NOTE: To learn about the shapes of the influences of the variables with the largest 
multi-class VIM values on the multi-class outcome, it is crucial to apply the 
\code{\link{plot.multifor}} function to the \code{multifor} object. Two further 
plot functions are \code{\link{plotMcl}} and \code{\link{plotVar}}.\cr\cr
NOTE also: The purpose of the multi forest algorithm is mainly to calculate 
the multi-class VIM values. A large-scale real data comparison study in 
Hornung & Hapfelmeier (2024) revealed that multi forests often have a slightly 
lower predictive performance than conventional random forests. This was 
especially true with respect to calibration and for data sets with many outcome classes. 
Therefore, if maximizing predictive performance is important, or if the data 
set has many outcome classes, classifiers other than multi forests 
(e.g., conventional random forests) should be explored for prediction.
}
\details{
The multi-class VIM is only calculated for variables that feature at least as
many unique values as there are outcome classes.\cr
Before learning the multi forest,
the categories of unordered categorical variables are ordered using an approach
by Coppersmith et al. (1999), which ensures that close categories feature similar 
outcome class distributions. This approach is also used in the \code{ranger} R package,
when using the option \code{respect.unordered.factors="order"}.
}
\examples{
\dontrun{

## Load package:

library("diversityForest")



## Set seed to make results reproducible:

set.seed(1234)



## Load the "ctg" data set:

data(ctg)



## Construct a multi forest:

model <- multifor(dependent.variable.name = "CLASS", data = ctg, 
                  num.trees = 20)

# NOTE: num.trees = 20 (in the above) would be much too small for practical 
# purposes. This small number of trees was simply used to keep the
# runtime of the example short.
# The default number of trees is num.trees = 5000 for datasets with a maximum of
# 5000 observations and num.trees = 1000 for datasets larger than that.



## The out-of-bag estimated Brier score (note that by default
## 'probability = TRUE' is used in 'multifor'):

model$prediction.error



## Inspect the multi-class and the discriminatory VIM values:

model$var.imp.multiclass

# --> Note that there are no multi-class VIM values for some of the variables.
# These are those for which there are fewer unique values than outcome classes.
# See the "Details" section above.

model$var.imp.discr


## Inspect the 5 variables with the largest multi-class VIM values and the
## 5 variables with the largest discriminatory VIM values:

sort(model$var.imp.multiclass, decreasing = TRUE)[1:5]

sort(model$var.imp.discr, decreasing = TRUE)[1:5]



## Instead of passing the name of the outcome variable through the 
## 'dependent.variable.name' argument as above, the formula interface can also 
## be used. Below, we fit a multi forest with only the first five variables 
## from the 'ctg' data set:

model <- multifor(CLASS ~ b + e + LBE + LB + AC, data=ctg, num.trees = 20)


## As expected, the out-of-bag estimated prediction error is much larger
## for this model:

model$prediction.error



## NOTE: Visual exploration of the results of the multi-class VIM analysis
## is crucial.
## Therefore, in practice the next step would be to apply the
## 'plot.multifor' function to the object 'model'.

# plot(model)





## Prediction:


# Split the 'ctg' data set randomly into training
# and test data:

data(ctg)
train.idx <- sample(nrow(ctg), 2/3 * nrow(ctg))
ctg.train <- ctg[train.idx, ]
ctg.test <- ctg[-train.idx, ]

# Construct multi forest on training data:
# NOTE again: num.trees = 20 is much too small for practical purposes.
model_train <- multifor(dependent.variable.name = "CLASS", data = ctg.train, 
                        importance = "none", probability = FALSE, 
                        num.trees = 20)
# NOTE: Because we are only interested in prediction here, we do not
# calculate VIM values (by setting importance = "none"), because this
# speeds up calculations.
# NOTE also: Because we are interested in class label prediction here
# rather than class probability prediction we specified 'probability = FALSE'
# above.

# Predict class values of the test data:
pred.ctg <- predict(model_train, data = ctg.test)

# Compare predicted and true class values of the test data:
table(ctg.test$CLASS, pred.ctg$predictions)



## Repeat the analysis for class probability prediction
## (default 'probability = TRUE'):

model_train <- multifor(dependent.variable.name = "CLASS", data = ctg.train, 
                        importance = "none", num.trees = 20)

# Predict class probabilities in the test data:
pred.ctg <- predict(model_train, data = ctg.test)

# The predictions are now a matrix of class probabilities:
head(pred.ctg$predictions)

# Obtain class predictions by choosing the classes with the maximum predicted
# probabilities (the function 'which.is.max' chooses one class randomly if
# there are several classes with maximum probability):
library("nnet")
classes <- levels(ctg.train$CLASS)
pred_classes <- factor(classes[apply(pred.ctg$predictions, 1, which.is.max)], 
                       levels=classes)

# Compare predicted and true class values of the test data:
table(ctg.test$CLASS, pred_classes)

}

}
\references{
\itemize{
  \item Hornung, R., Hapfelmeier, A. (2024). Multi forests: Variable importance for multi-class outcomes. arXiv:2409.08925, <\doi{10.48550/arXiv.2409.08925}>.
  \item Hornung, R. (2022). Diversity forests: Using split sampling to enable innovative complex split procedures in random forests. SN Computer Science 3(2):1, <\doi{10.1007/s42979-021-00920-1}>.
  \item Wright, M. N., Ziegler, A. (2017). ranger: A fast implementation of random forests for high dimensional data in C++ and R. Journal of Statistical Software 77:1-17, <\doi{10.18637/jss.v077.i01}>.
  \item Breiman, L. (2001). Random forests. Machine Learning 45:5-32, <\doi{10.1023/A:1010933404324}>.
  \item Malley, J. D., Kruppa, J., Dasgupta, A., Malley, K. G., & Ziegler, A. (2012). Probability machines: consistent probability estimation using nonparametric learning machines. Methods of Information in Medicine 51:74-81, <\doi{10.3414/ME00-01-0052}>.
  \item Coppersmith, D., Hong, S. J., Hosking, J. R. (1999). Partitioning nominal attributes in decision trees. Data Mining and Knowledge Discovery 3:197-217, <\doi{10.1023/A:1009869804967}>.
  }
}
\seealso{
\code{\link{predict.multifor}}
}
\author{
Roman Hornung, Marvin N. Wright
}
