% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ParseSettings.R
\name{.parse_evaluation_settings}
\alias{.parse_evaluation_settings}
\title{Internal function for parsing settings related to model evaluation}
\usage{
.parse_evaluation_settings(
  config = NULL,
  data,
  parallel,
  outcome_type,
  hpo_metric,
  development_batch_id,
  vimp_aggregation_method,
  vimp_aggregation_rank_threshold,
  prep_cluster_method,
  prep_cluster_linkage_method,
  prep_cluster_cut_method,
  prep_cluster_similarity_threshold,
  prep_cluster_similarity_metric,
  evaluate_top_level_only = waiver(),
  skip_evaluation_elements = waiver(),
  ensemble_method = waiver(),
  evaluation_metric = waiver(),
  sample_limit = waiver(),
  detail_level = waiver(),
  estimation_type = waiver(),
  aggregate_results = waiver(),
  confidence_level = waiver(),
  bootstrap_ci_method = waiver(),
  feature_cluster_method = waiver(),
  feature_cluster_cut_method = waiver(),
  feature_linkage_method = waiver(),
  feature_similarity_metric = waiver(),
  feature_similarity_threshold = waiver(),
  sample_cluster_method = waiver(),
  sample_linkage_method = waiver(),
  sample_similarity_metric = waiver(),
  eval_aggregation_method = waiver(),
  eval_aggregation_rank_threshold = waiver(),
  eval_icc_type = waiver(),
  stratification_method = waiver(),
  stratification_threshold = waiver(),
  time_max = waiver(),
  evaluation_times = waiver(),
  dynamic_model_loading = waiver(),
  parallel_evaluation = waiver(),
  ...
)
}
\arguments{
\item{config}{A list of settings, e.g. from an xml file.}

\item{data}{Data set as loaded using the \code{.load_data} function.}

\item{parallel}{Logical value that indicates whether familiar uses
parallelisation. If \code{FALSE} it will override \code{parallel_evaluation}.}

\item{outcome_type}{Type of outcome found in the data set.}

\item{hpo_metric}{Metric defined for hyperparameter optimisation.}

\item{development_batch_id}{Identifiers of batches used for model development.
These identifiers are used to determine the cohorts used to determine a
setting for \code{time_max}, if the \code{outcome_type} is \code{survival}, and both
\code{time_max} and \code{evaluation_times} are not provided.}

\item{vimp_aggregation_method}{Method for variable importance aggregation that
was used for feature selection.}

\item{vimp_aggregation_rank_threshold}{Rank threshold for variable importance
aggregation used during feature selection.}

\item{prep_cluster_method}{Cluster method used during pre-processing.}

\item{prep_cluster_linkage_method}{Cluster linkage method used during
pre-processing.}

\item{prep_cluster_cut_method}{Cluster cut method used during pre-processing.}

\item{prep_cluster_similarity_threshold}{Cluster similarity threshold used
during pre-processing.}

\item{prep_cluster_similarity_metric}{Cluster similarity metric used during
pre-processing.}

\item{evaluate_top_level_only}{(\emph{optional}) Flag that signals that only
evaluation at the most global experiment level is required. Consider a
cross-validation experiment with additional external validation. The global
experiment level consists of data that are used for development, internal
validation and external validation. The next lower experiment level consists
of the individual cross-validation iterations.

When the flag is \code{true}, evaluations take place on the global level only,
and no results are generated for the next lower experiment levels. In our
example, this means that results from individual cross-validation iterations
are not computed and shown. When the flag is \code{false}, results are computed
from both the global layer and the next lower level.

Setting the flag to \code{true} saves computation time.}

\item{skip_evaluation_elements}{(\emph{optional}) Specifies which evaluation steps,
if any, should be skipped as part of the evaluation process. Defaults to
\code{none}, which means that all relevant evaluation steps are performed. It can
have one or more of the following values:
\itemize{
\item \code{none}, \code{false}: no steps are skipped.
\item \code{all}, \code{true}: all steps are skipped.
\item \code{auc_data}: data for assessing and plotting the area under the receiver
operating characteristic curve are not computed.
\item \code{calibration_data}: data for assessing and plotting model calibration are
not computed.
\item \code{calibration_info}: data required to assess calibration, such as baseline
survival curves, are not collected. These data will still be present in the
models.
\item \code{confusion_matrix}: data for assessing and plotting a confusion matrix are
not collected.
\item \code{decision_curve_analyis}: data for performing a decision curve analysis
are not computed.
\item \code{feature_expressions}: data for assessing and plotting sample clustering
are not computed.
\item \code{feature_similarity}: data for assessing and plotting feature clusters are
not computed.
\item \code{fs_vimp}: data for assessing and plotting feature selection-based
variable importance are not collected.
\item \code{hyperparameters}: data for assessing model hyperparameters are not
collected. These data will still be present in the models.
\item \code{ice_data}: data for individual conditional expectation and partial
dependence plots are not created.
\item \code{model_performance}: data for assessing and visualising model performance
are not created.
\item \code{model_vimp}: data for assessing and plotting model-based variable
importance are not collected.
\item \code{permutation_vimp}: data for assessing and plotting model-agnostic
permutation variable importance are not computed.
\item \code{prediction_data}: predictions for each sample are not made and exported.
\item \code{risk_stratification_data}: data for assessing and plotting Kaplan-Meier
survival curves are not collected.
\item \code{risk_stratification_info}: data for assessing stratification into risk
groups are not computed.
\item \code{univariate_analysis}: data for assessing and plotting univariate feature
importance are not computed.
}}

\item{ensemble_method}{(\emph{optional}) Method for ensembling predictions from
models for the same sample. Available methods are:
\itemize{
\item \code{median} (default): Use the median of the predicted values as the ensemble
value for a sample.
\item \code{mean}: Use the mean of the predicted values as the ensemble value for a
sample.
}

This parameter is only used if \code{detail_level} is \code{ensemble}.}

\item{evaluation_metric}{(\emph{optional}) One or more metrics for assessing model
performance. See the vignette on performance metrics for the available
metrics.

Confidence intervals (or rather credibility intervals) are computed for each
metric during evaluation. This is done using bootstraps, the number of which
depends on the value of \code{confidence_level} (Davison and Hinkley, 1997).

If unset, the metric in the \code{optimisation_metric} variable is used.}

\item{sample_limit}{(\emph{optional}) Set the upper limit of the number of samples
that are used during evaluation steps. Cannot be less than 20.

This setting can be specified per data element by providing a parameter
value in a named list with data elements, e.g.
\code{list("sample_similarity"=100, "ice_data"=1000)}.

This parameter can be set for the following data elements:
\code{sample_similarity} and \code{ice_data}.}

\item{detail_level}{(\emph{optional}) Sets the level at which results are computed
and aggregated.
\itemize{
\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all
models in the ensemble. This means that, for example, bias-corrected
estimates of model performance are assessed by creating (at least) 20
bootstraps and computing the model performance of the ensemble model for
each bootstrap.
\item \code{hybrid} (default): Results are computed at the level of models in an
ensemble. This means that, for example, bias-corrected estimates of model
performance are directly computed using the models in the ensemble. If there
are at least 20 trained models in the ensemble, performance is computed for
each model, in contrast to \code{ensemble} where performance is computed for the
ensemble of models. If there are fewer than 20 trained models in the
ensemble, bootstraps are created so that at least 20 point estimates can be
made.
\item \code{model}: Results are computed at the model level. This means that, for
example, bias-corrected estimates of model performance are assessed by
creating (at least) 20 bootstraps and computing the performance of the model
for each bootstrap.
}

Note that each level of detail has a different interpretation for bootstrap
confidence intervals. For \code{ensemble} and \code{model} these are the confidence
intervals for the ensemble and an individual model, respectively. That is,
the confidence interval describes the range where an estimate produced by a
respective ensemble or model trained on a repeat of the experiment may be
found with the probability of the confidence level. For \code{hybrid}, it
represents the range where any single model trained on a repeat of the
experiment may be found with the probability of the confidence level. By
definition, confidence intervals obtained using \code{hybrid} are at least as
wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if
the goal of the analysis is to assess the result of a single, unspecified,
model.

\code{hybrid} is generally computationally less expensive than \code{ensemble}, which
in turn is somewhat less expensive than \code{model}.

A non-default \code{detail_level} parameter can be specified for separate
evaluation steps by providing a parameter value in a named list with data
elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}.
This parameter can be set for the following data elements: \code{auc_data},
\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp},
\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.}

\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be
possible. This has the following options:
\itemize{
\item \code{point}: Point estimates.
\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected
estimate is computed from (at least) 20 point estimates, and \code{familiar} may
bootstrap the data to create them.
\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected
estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The
number of point estimates required depends on the \code{confidence_level}
parameter, and \code{familiar} may bootstrap the data to create them.
}

As with \code{detail_level}, a non-default \code{estimation_type} parameter can be
specified for separate evaluation steps by providing a parameter value in a
named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following
data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance},
\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.}

\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results
should be aggregated during evaluation. If \code{estimation_type} is
\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected
estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci},
aggregation leads to a single bias-corrected estimate with lower and upper
boundaries of the confidence interval. This has no effect if
\code{estimation_type} is \code{point}.

The default value is equal to \code{TRUE} except when assessing metrics to assess
model performance, as the default violin plot requires underlying data.

As with \code{detail_level} and \code{estimation_type}, a non-default
\code{aggregate_results} parameter can be specified for separate evaluation steps
by providing a parameter value in a named list with data elements, e.g.
\code{list("auc_data"=TRUE, "model_performance"=FALSE)}. This parameter exists
for the same elements as \code{estimation_type}.}

\item{confidence_level}{(\emph{optional}) Numeric value for the level at which
confidence intervals are determined. In the case bootstraps are used to
determine the confidence intervals, \code{familiar} uses the rule of thumb
\eqn{n = 20 / ci.level} to determine the number of required bootstraps.

The default value is \code{0.95}.}

\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap
confidence intervals (Efron and Hastie, 2016). The following methods are
implemented:
\itemize{
\item \code{percentile} (default): Confidence intervals obtained using the percentile
method.
\item \code{bc}: Bias-corrected confidence intervals.
}

Note that the standard method is not implemented because this method is
often not suitable due to non-normal distributions. The bias-corrected and
accelerated (BCa) method is not implemented yet.}

\item{feature_cluster_method}{(\emph{optional}) Method used to perform clustering
of features. The same methods as for the \code{cluster_method} configuration
parameter are available: \code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}.

The value for the \code{cluster_method} configuration parameter is used by
default. When generating clusters for the purpose of determining mutual
correlation and ordering feature expressions, \code{none} is ignored and \code{hclust}
is used instead.}

\item{feature_cluster_cut_method}{(\emph{optional}) Method used to divide features
into separate clusters. The available methods are the same as for the
\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and
\code{dynamic_cut}.

\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only
applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and
\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only
be used with \code{agnes} and \code{hclust}.

The value for the \code{cluster_cut_method} configuration parameter is used by
default.}

\item{feature_linkage_method}{(\emph{optional}) Method used for agglomerative
clustering with \code{hclust} and \code{agnes}. Linkage determines how features are
sequentially combined into clusters based on distance. The methods are
shared with the \code{cluster_linkage_method} configuration parameter: \code{average},
\code{single}, \code{complete}, \code{weighted}, and \code{ward}.

The value for the \code{cluster_linkage_method} configuration parameter is used
by default.}

\item{feature_similarity_metric}{(\emph{optional}) Metric to determine pairwise
similarity between features. Similarity is computed in the same manner as
for clustering, and \code{feature_similarity_metric} therefore has the same
options as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2},
\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}.

The value for the \code{cluster_similarity_metric} configuration parameter
is used by default.}

\item{feature_similarity_threshold}{(\emph{optional}) The threshold level for
pair-wise similarity that is required to form feature clusters with the
\code{fixed_cut} method. This threshold functions in the same manner as the one
defined using the \code{cluster_similarity_threshold} parameter.

By default, the value for the \code{cluster_similarity_threshold} configuration
parameter is used.

Unlike for \code{cluster_similarity_threshold}, more than one value can be
supplied here.}

\item{sample_cluster_method}{(\emph{optional}) The method used to perform
clustering based on distance between samples. These are the same methods as
for the \code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana}
and \code{pam}.

The value for the \code{cluster_method} configuration parameter is used by
default. When generating clusters for the purpose of ordering samples in
feature expressions, \code{none} is ignored and \code{hclust} is used instead.}

\item{sample_linkage_method}{(\emph{optional}) The method used for agglomerative
clustering in \code{hclust} and \code{agnes}. These are the same methods as for the
\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single},
\code{complete}, \code{weighted}, and \code{ward}.

The value for the \code{cluster_linkage_method} configuration parameter is used
by default.}

\item{sample_similarity_metric}{(\emph{optional}) Metric to determine pairwise
similarity between samples. Similarity is computed in the same manner as for
clustering, but \code{sample_similarity_metric} has different options that are
better suited to computing distance between samples instead of between
features. The following metrics are available:
\itemize{
\item \code{gower} (default): compute Gower's distance between samples. By default,
Gower's distance is computed based on winsorised data to reduce the effect
of outliers (see below).
\item \code{euclidean}: compute the Euclidean distance between samples.
}

The underlying feature data for numerical features is scaled to the
\eqn{[0,1]} range using the feature values across the samples. The
normalisation parameters required can optionally be computed from feature
data with the outer 5\% (on both sides) of feature values trimmed or
winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to
the metric name. This reduces the effect of outliers somewhat.

Regardless of metric, all categorical features are handled as for the
Gower's distance: distance is 0 if the values in a pair of samples match,
and 1 if they do not.}

\item{eval_aggregation_method}{(\emph{optional}) Method for aggregating variable
importances for the purpose of evaluation. Variable importances are
determined during feature selection steps and after training the model. Both
types are evaluated, but feature selection variable importance is only
evaluated at run-time.

See the documentation for the \code{vimp_aggregation_method} argument for
information concerning the different methods available.}

\item{eval_aggregation_rank_threshold}{(\emph{optional}) The threshold used to
define the subset of highly important features during evaluation.

See the documentation for the \code{vimp_aggregation_rank_threshold} argument for
more information.}

\item{eval_icc_type}{(\emph{optional}) String indicating the type of intraclass
correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute
robustness for features in repeated measurements during the evaluation of
univariate importance. These types correspond to the types in Shrout and
Fleiss (1979). The default value is \code{1}.}

\item{stratification_method}{(\emph{optional}) Method for determining the
stratification threshold for creating survival groups. The actual,
model-dependent, threshold value is obtained from the development data, and
can afterwards be used to perform stratification on validation data.

The following stratification methods are available:
\itemize{
\item \code{median} (default): The median predicted value in the development cohort
is used to stratify the samples into two risk groups. For predicted outcome
values that build a continuous spectrum, the two risk groups in the
development cohort will be roughly equal in size.
\item \code{mean}: The mean predicted value in the development cohort is used to
stratify the samples into two risk groups.
\item \code{mean_trim}: As \code{mean}, but based on the set of predicted values
where the 5\% lowest and 5\% highest values are discarded. This reduces the
effect of outliers.
\item \code{mean_winsor}: As \code{mean}, but based on the set of predicted values where
the 5\% lowest and 5\% highest values are winsorised. This reduces the effect
of outliers.
\item \code{fixed}: Samples are stratified based on the sample quantiles of the
predicted values. These quantiles are defined using the
\code{stratification_threshold} parameter.
\item \code{optimised}: Use maximally selected rank statistics to determine the
optimal threshold (Lausen and Schumacher, 1992; Hothorn and Lausen, 2003) to
stratify samples into two optimally separated risk groups.
}

One or more stratification methods can be selected simultaneously.

This parameter is only relevant for \code{survival} outcomes.}

\item{stratification_threshold}{(\emph{optional}) Numeric value(s) signifying the
sample quantiles for stratification using the \code{fixed} method. The number of
risk groups will be the number of values +1.

The default value is \code{c(1/3, 2/3)}, which will yield two thresholds that
divide samples into three equally sized groups. If \code{fixed} is not among the
selected stratification methods, this parameter is ignored.

This parameter is only relevant for \code{survival} outcomes.}

\item{time_max}{(\emph{optional}) Time point which is used as the benchmark for
e.g. cumulative risks generated by random forest, or the cutoff for Uno's
concordance index.

If \code{time_max} is not provided, but \code{evaluation_times} is, the largest value
of \code{evaluation_times} is used. If both are not provided, \code{time_max} is set
to the 98th percentile of the distribution of survival times for samples
with an event in the development data set.

This parameter is only relevant for \code{survival} outcomes.}

\item{evaluation_times}{(\emph{optional}) One or more time points that are used for
assessing calibration in survival problems. This is done as expected and
observed survival probabilities depend on time.

If unset, \code{evaluation_times} will be equal to \code{time_max}.

This parameter is only relevant for \code{survival} outcomes.}

\item{dynamic_model_loading}{(\emph{optional}) Enables dynamic loading of models
during the evaluation process, if \code{TRUE}. Defaults to \code{FALSE}. Dynamic
loading of models may reduce the overall memory footprint, at the cost of
increased disk or network IO. Models can only be dynamically loaded if they
are found at an accessible disk or network location. Setting this parameter
to \code{TRUE} may help if parallel processing causes out-of-memory issues during
evaluation.}

\item{parallel_evaluation}{(\emph{optional}) Enable parallel processing for
model evaluation. Defaults to \code{TRUE}. When set to \code{FALSE}, this
will disable the use of parallel processing while performing evaluation,
regardless of the settings of the \code{parallel} parameter. The parameter
moreover specifies whether parallelisation takes place within the evaluation
process steps (\code{inner}, default), or in an outer loop (\code{outer}) over
learners, data subsamples, etc.

\code{parallel_evaluation} is ignored if \code{parallel=FALSE}.}

\item{...}{Unused arguments.}
}
\value{
List of parameters related to model evaluation.
}
\description{
Internal function for parsing settings related to model evaluation
}
\references{
\enumerate{
\item Davison, A. C. & Hinkley, D. V. Bootstrap methods and their
application. (Cambridge University Press, 1997).
\item Efron, B. & Hastie, T. Computer Age Statistical Inference. (Cambridge
University Press, 2016).
\item Lausen, B. & Schumacher, M. Maximally Selected Rank Statistics.
Biometrics 48, 73 (1992).
\item Hothorn, T. & Lausen, B. On the exact distribution of maximally selected
rank statistics. Comput. Stat. Data Anal. 43, 121–137 (2003).
}
}
\keyword{internal}
