% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/est_block_error.R
\name{est_block_error}
\alias{est_block_error}
\title{Estimate errors due to blocking in record linkage}
\usage{
est_block_error(
  x = NULL,
  y = NULL,
  blocking_result = NULL,
  n = NULL,
  N = NULL,
  G,
  alpha = NULL,
  p = NULL,
  lambda = NULL,
  tol = 10^(-4),
  maxiter = 100,
  sample_size = NULL
)
}
\arguments{
\item{x}{Reference data (required if \code{n} and \code{N} are not provided).}

\item{y}{Query data (required if \code{n} is not provided).}

\item{blocking_result}{\code{data.frame} or \code{data.table} containing blocking results (required if \code{n} is not provided).}

\item{n}{Integer vector of numbers of accepted pairs formed by each record in the query data set
with records in the reference data set, based on blocking criteria (if \code{NULL}, derived from \code{blocking_result}).}

\item{N}{Total number of records in the reference data set (if \code{NULL}, derived as \code{length(x)}).}

\item{G}{Number of classes in the finite mixture model.}

\item{alpha}{Numeric vector of initial class proportions (length \code{G}; if \code{NULL}, initialized as \code{rep(1/G, G)}).}

\item{p}{Numeric vector of initial matching probabilities in each class of the mixture model
(length \code{G}; if \code{NULL}, randomly initialized from \code{runif(G, 0.5, 1)}).}

\item{lambda}{Numeric vector of initial Poisson distribution parameters for non-matching records in each class of the mixture model
(length \code{G}; if \code{NULL}, randomly initialized from \code{runif(G, 0.1, 2)}).}

\item{tol}{Convergence tolerance for the EM algorithm (default \code{10^(-4)}).}

\item{maxiter}{Maximum number of iterations for the EM algorithm (default \code{100}).}

\item{sample_size}{Size of the bootstrap sample (drawn from \code{n}) used for calculations (if \code{NULL}, all data are used).}
}
\value{
Returns a list containing:\cr
\itemize{
\item{\code{FPR} -- estimated false positive rate,}
\item{\code{FNR} -- estimated false negative rate,}
\item{\code{iter} -- number of EM algorithm iterations performed,}
\item{\code{convergence} -- logical, indicating whether the EM algorithm converged within \code{maxiter} iterations.}
}
}
\description{
Function computes estimators for false positive rate (FPR) and false negative rate (FNR) due to blocking in record linkage,
as proposed by Dasylva and Goussanou (2021). Assumes duplicate-free data sources,
complete coverage of the reference data set and blocking decisions based solely on record pairs.
}
\details{
Consider a large finite population that comprises \eqn{N} individuals, and two duplicate-free data sources: a register and a file.
Assume that the register has no undercoverage,
i.e. each record from the file corresponds to exactly one record from the same individual in the register.
Let \eqn{n_i} denote the number of register records which form an accepted (by the blocking criteria) pair with
record \eqn{i} on the file. Assume that:\cr
\itemize{
\item two matched records are neighbours with a probability that is bounded away from \eqn{0} regardless of \eqn{N},
\item two unmatched records are accidental neighbours with a probability of \eqn{O(\frac{1}{N})}.
}
The finite mixture model \eqn{n_i \sim \sum_{g=1}^G \alpha_g(\text{Bernoulli}(p_g) \ast \text{Poisson}(\lambda_g))} is assumed.
When \eqn{G} is fixed, the unknown model parameters are given by the vector \eqn{\psi = [(\alpha_g, p_g, \lambda_g)]_{1 \leq g \leq G}}
that may be estimated with the Expectation-Maximization (EM) procedure.

Let \eqn{n_i = n_{i|M} + n_{i|U}}, where \eqn{n_{i|M}} is the number of matched neighbours
and \eqn{n_{i|U}} is the number of unmatched neighbours, and let \eqn{c_{ig}} denote
the indicator that record \eqn{i} is from class \eqn{g}.
For the E-step of the EM procedure, the equations are as follows
\deqn{
\begin{aligned}
P(n_i | c_{ig} = 1) &= I(n_i = 0)(1-p_g)e^{-\lambda_g}+I(n_i > 0)\Bigl(p_g+(1-p_g)\frac{\lambda_g}{n_i}\Bigr)\frac{e^{-\lambda_g}\lambda_g^{n_i-1}}{(n_i-1)!}, \\
P(c_{ig} = 1 | n_i) &= \frac{\alpha_gP(n_i | c_{ig} = 1)}{\sum_{g'=1}^G\alpha_{g'}P(n_i | c_{ig'} = 1)}, \\
P(n_{i|M} = 1 | n_i,c_{ig} = 1) &= \frac{p_gn_i}{p_gn_i + (1-p_g)\lambda_g}, \\
P(n_{i|U} = n_i | n_i,c_{ig} = 1) &= I(n_i = 0) + I(n_i > 0)\frac{(1-p_g)\lambda_g}{p_gn_i + (1-p_g)\lambda_g}, \\
P(n_{i|U} = n_i-1 | n_i,c_{ig} = 1) &= \frac{p_gn_i}{p_gn_i + (1-p_g)\lambda_g}, \\
E[c_{ig}n_{i|M} | n_i] &= P(c_{ig} = 1 | n_i)P(n_{i|M} = 1 | n_i,c_{ig} = 1), \\
E[n_{i|U} | n_i,c_{ig} = 1] &= \Bigl(\frac{p_g(n_i-1) + (1-p_g)\lambda_g}{p_gn_i + (1-p_g)\lambda_g}\Bigr)n_i, \\
E[c_{ig}n_{i|U} | n_i] &= P(c_{ig} = 1 | n_i)E[n_{i|U} | n_i,c_{ig} = 1].
\end{aligned}
}
The M-step is given by the following equations, where \eqn{m} denotes the number of records in the file
\deqn{
\begin{aligned}
\hat{p}_g &= \frac{\sum_{i=1}^mE[c_{ig}n_{i|M} | n_i;\psi]}{\sum_{i=1}^mE[c_{ig} | n_i; \psi]}, \\
\hat{\lambda}_g &= \frac{\sum_{i=1}^mE[c_{ig}n_{i|U} | n_i; \psi]}{\sum_{i=1}^mE[c_{ig} | n_i; \psi]}, \\
\hat{\alpha}_g &= \frac{1}{m}\sum_{i=1}^mE[c_{ig} | n_i; \psi].
\end{aligned}
}
As \eqn{N \to \infty}, the error rates and the model parameters are related as follows
\deqn{
\begin{aligned}
\text{FNR} &\xrightarrow{p} 1 - E[p(v_i)], \\
(N-1)\text{FPR} &\xrightarrow{p} E[\lambda(v_i)],
\end{aligned}
}
where \eqn{E[p(v_i)] = \sum_{g=1}^G\alpha_gp_g} and \eqn{E[\lambda(v_i)] = \sum_{g=1}^G\alpha_g\lambda_g}.
}
\examples{
## an example proposed by Dasylva and Goussanou (2021)
## we obtain results very close to those reported in the paper

set.seed(111)

neighbors <- rep(0:5, c(1659, 53951, 6875, 603, 62, 5))

errors <- est_block_error(n = neighbors,
                          N = 63155,
                          G = 2,
                          tol = 10^(-3),
                          maxiter = 50)

errors

}
\references{
Dasylva, A., Goussanou, A. (2021). Estimating the false negatives due to blocking in record linkage.
Survey Methodology, Statistics Canada, Catalogue No. 12-001-X, Vol. 47, No. 2.

Dasylva, A., Goussanou, A. (2022). On the consistent estimation of linkage errors without training data.
Jpn J Stat Data Sci 5, 181–216. \doi{10.1007/s42081-022-00153-3}
}
