% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/optimal_kmeans_d.R
\name{optimal_kmeans_d}
\alias{optimal_kmeans_d}
\title{Obtain optimal D solution based on k-means clustering of disease marker
data in a case-control study}
\usage{
optimal_kmeans_d(markers, M, factors, case, data, nstart = 100, seed = NULL)
}
\arguments{
\item{markers}{a vector of the names of the disease markers. These markers
should be of a type that is suitable for use with
\code{\link[stats]{kmeans}} clustering. All markers will be missing
for control subjects. For example, \code{markers = c("marker1", "marker2")}.}

\item{M}{the number of clusters to identify using
\code{\link[stats]{kmeans}} clustering. Must satisfy \code{M >= 2}.}

\item{factors}{a list of the names of the binary or continuous risk factors.
For binary risk factors the lowest level will be used as the reference level.
For example, \code{factors = list("age", "sex", "race")}.}

\item{case}{denotes the variable that contains each subject's status as a
case or control. This value should be 1 for cases and 0 for controls.
Argument must be supplied in quotes, e.g. \code{case = "status"}.}

\item{data}{the name of the data frame that contains the relevant variables.}

\item{nstart}{the number of random starts to use with
\code{\link[stats]{kmeans}} clustering. Defaults to 100.}

\item{seed}{an integer argument passed to \code{\link{set.seed}}.
Defaults to \code{NULL}. Setting a seed is recommended in order to obtain
reproducible results.}
}
\value{
Returns a list with the following elements:

\code{optimal_d}: The D value for the optimal D solution.

\code{optimal_d_data}: The original data frame supplied through the
\code{data} argument, with a column called \code{optimal_d_label}
added for the optimal D subtype label.
This column holds the subtype assignment for cases, and is 0 for all controls.
}
\description{
\code{optimal_kmeans_d} applies k-means clustering using the
\code{\link[stats]{kmeans}} function with many random starts. The D value is
then calculated for the cluster solution at each random start using the
\code{\link{d}} function, and the cluster solution that maximizes D is returned,
along with the corresponding value of D. In this way the optimally
etiologically heterogeneous subtype solution can be identified from possibly
high-dimensional disease marker data.
}
\examples{
\donttest{
# Cluster the 30 disease markers y1, ..., y30 to identify the optimally
# etiologically heterogeneous 3-subtype solution
res <- optimal_kmeans_d(
  # paste0() is vectorized, so this yields c("y1", "y2", ..., "y30")
  markers = paste0("y", 1:30),
  M = 3,
  factors = list("x1", "x2", "x3"),
  case = "case",
  data = subtype_data,
  nstart = 100,
  # Fixed seed so the random starts, and hence the result, are reproducible
  seed = 81110224
)

# Look at the value of D for the optimal D solution
res[["optimal_d"]]

# Look at a table of the optimal D solution
table(res[["optimal_d_data"]]$optimal_d_label)
}

}
\references{
Begg, C. B., Zabor, E. C., Bernstein, J. L., Bernstein, L., Press, M. F., &
Seshan, V. E. (2013). A conceptual and methodological framework for
investigating etiologic heterogeneity. Stat Med, 32(29), 5039-5052.
}
