% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/step_nearmiss.R
\name{step_nearmiss}
\alias{step_nearmiss}
\title{Remove Points Near Other Classes}
\usage{
step_nearmiss(
  recipe,
  ...,
  role = NA,
  trained = FALSE,
  column = NULL,
  under_ratio = 1,
  neighbors = 5,
  skip = TRUE,
  seed = sample.int(10^5, 1),
  id = rand_id("nearmiss")
)
}
\arguments{
\item{recipe}{A recipe object. The step will be added to the
sequence of operations for this recipe.}

\item{...}{One or more selector functions to choose which
variable is used to sample the data. See \code{\link[=selections]{selections()}}
for more details. The selection should result in a \emph{single
factor variable}. For the \code{tidy} method, these are not
currently used.}

\item{role}{Not used by this step since no new variables are
created.}

\item{trained}{A logical to indicate if the quantities for
preprocessing have been estimated.}

\item{column}{A character string of the variable name that will
be populated (eventually) by the \code{...} selectors.}

\item{under_ratio}{A numeric value for the ratio of the
majority-to-minority frequencies. The default value (1) means
that all other levels are sampled down to have the same
frequency as the least occurring level. A value of 2 would mean
that the majority levels will have (at most) (approximately)
twice as many rows as the minority level.}

\item{neighbors}{An integer. Number of nearest neighbors in the minority
class that are used to compute the mean distance for each majority class
point when deciding which points to retain.}

\item{skip}{A logical. Should the step be skipped when the
recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked
when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be
conducted on new data (e.g. processing the outcome variable(s)).
Care should be taken when using \code{skip = TRUE} as it may affect
the computations for subsequent operations.}

\item{seed}{An integer that will be used as the seed when
applied.}

\item{id}{A character string that is unique to this step to identify it.}
}
\value{
An updated version of \code{recipe} with the new step
added to the sequence of existing steps (if any). For the
\code{tidy} method, a tibble with a \code{terms} column, which
holds the variable used to sample.
}
\description{
\code{step_nearmiss} creates a \emph{specification} of a recipe
step that removes majority class instances by undersampling points
in the majority class based on their distance to points in the
minority class.
}
\details{
This method retains the points from the majority class which have the
smallest mean distance to the k nearest points in the minority class.

All columns in the data are sampled and returned by \code{\link[=juice]{juice()}}
and \code{\link[=bake]{bake()}}.

All columns used in this step must be numeric with no missing data.

When used in modeling, users should strongly consider using the
option \code{skip = TRUE} so that the extra sampling is \emph{not}
conducted outside of the training set.
}
\section{Tidying}{
When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble with a \code{terms}
column (the selectors or variables selected) will be returned.
}

\section{Tuning Parameters}{
This step has 2 tuning parameters:
\itemize{
\item \code{under_ratio}: Under-Sampling Ratio (type: double, default: 1)
\item \code{neighbors}: # Nearest Neighbors (type: integer, default: 5)
}
}

\section{Case weights}{


The underlying operation does not allow for case weights.
}

\examples{
library(recipes)
library(modeldata)
data(hpc_data)

# Drop the `protocol` and `day` columns before building the recipe
hpc_data0 <- hpc_data \%>\%
  select(-protocol, -day)

# Class frequencies in the original data
orig <- count(hpc_data0, class, name = "orig")
orig

up_rec <- recipe(class ~ ., data = hpc_data0) \%>\%
  # Bring the majority levels down to about 1000 each
  # 1000/259 is approx 3.862
  step_nearmiss(class, under_ratio = 3.862) \%>\%
  prep()

# Class frequencies of the training data after the step has been applied
training <- up_rec \%>\%
  bake(new_data = NULL) \%>\%
  count(class, name = "training")
training

# Since `skip` defaults to TRUE, the step is skipped when baking new data,
# so the class frequencies are unchanged
baked <- up_rec \%>\%
  bake(new_data = hpc_data0) \%>\%
  count(class, name = "baked")
baked

# Note that if the original data contained fewer rows than the
# target n (= ratio * minority_n), the data are left alone:
orig \%>\%
  left_join(training, by = "class") \%>\%
  left_join(baked, by = "class")

library(ggplot2)

# Plot the example data before under-sampling
ggplot(circle_example, aes(x, y, color = class)) +
  geom_point() +
  labs(title = "Without NEARMISS") +
  xlim(c(1, 15)) +
  ylim(c(1, 15))

# Plot the same data after applying the step with its default settings
recipe(class ~ x + y, data = circle_example) \%>\%
  step_nearmiss(class) \%>\%
  prep() \%>\%
  bake(new_data = NULL) \%>\%
  ggplot(aes(x, y, color = class)) +
  geom_point() +
  labs(title = "With NEARMISS") +
  xlim(c(1, 15)) +
  ylim(c(1, 15))
}
\references{
Inderjeet Mani and I Zhang. kNN approach to unbalanced data
distributions: a case study involving information extraction. In Proceedings
of workshop on learning from imbalanced datasets, 2003.
}
\seealso{
\code{\link[=nearmiss]{nearmiss()}} for direct implementation

Other Steps for under-sampling: 
\code{\link{step_downsample}()},
\code{\link{step_tomek}()}
}
\concept{Steps for under-sampling}
