% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_lookup.R
\name{dfm_lookup}
\alias{dfm_lookup}
\title{Apply a dictionary to a dfm}
\usage{
dfm_lookup(x, dictionary, levels = 1:5, exclusive = TRUE,
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  capkeys = !exclusive, nomatch = NULL,
  verbose = quanteda_options("verbose"))
}
\arguments{
\item{x}{the dfm to which the dictionary will be applied}

\item{dictionary}{a \link{dictionary} class object}

\item{levels}{levels of entries in a hierarchical dictionary that will be 
applied}

\item{exclusive}{if \code{TRUE}, remove all features not in the dictionary;
otherwise, replace values in the dictionary with keys while leaving other
features unaffected}

\item{valuetype}{the type of pattern matching: \code{"glob"} for 
"glob"-style wildcard expressions; \code{"regex"} for regular expressions;
or \code{"fixed"} for exact matching. See \link{valuetype} for details.}

\item{case_insensitive}{ignore the case of dictionary values if \code{TRUE}}

\item{capkeys}{if \code{TRUE}, convert dictionary keys to uppercase to
distinguish them from other features}

\item{nomatch}{an optional character naming a new feature that will contain 
the counts of features of \code{x} not matched to a dictionary key.  If 
\code{NULL} (default), do not tabulate unmatched features.}

\item{verbose}{print status messages if \code{TRUE}}
}
\description{
Apply a dictionary to a dfm by looking up all dfm features for matches in a
set of \link{dictionary} values, and replace those features with a count of
the dictionary's keys.  If \code{exclusive = FALSE} then the behaviour is to
apply a "thesaurus", where each value match is replaced by the dictionary
key, converted to capitals if \code{capkeys = TRUE} (so that the replacements
are easily distinguished from features that were terms found originally in
the document).
}
\note{
If using \code{dfm_lookup} with dictionaries containing multi-word
  values, matches will only occur if the features themselves are multi-word
  or formed from ngrams. A better way to match dictionary values that include
  multi-word patterns is to apply \code{\link{tokens_lookup}} to the tokens,
  and then construct the dfm.
}
\examples{
# a dictionary with five keys; "taxglob" and "taxregex" hold one pattern each,
# written in glob and in regex syntax, and "notincorpus" is a value that occurs
# in neither of the two texts below
my_dict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
                          opposition = c("Opposition", "reject", "notincorpus"),
                          taxglob = "tax*",
                          taxregex = "tax.+$",
                          country = c("United_States", "Sweden")))
# a two-document dfm to which the dictionary is applied
my_dfm <- dfm(c("My Christmas was ruined by your opposition tax plan.", 
               "Does the United_States or Sweden have more progressive taxation?"),
             remove = stopwords("english"), verbose = FALSE)
my_dfm

# glob format: dictionary values are treated as glob-style wildcard patterns;
# the second call sets case_insensitive = FALSE so the case of the dictionary
# values is no longer ignored
dfm_lookup(my_dfm, my_dict, valuetype = "glob")
dfm_lookup(my_dfm, my_dict, valuetype = "glob", case_insensitive = FALSE)

# regex v. glob format: every dictionary value is now read as a regular
# expression.  Read as a regex, "tax*" means "ta" followed by zero or more "x",
# so "united_states" is a regex match for "tax*" (the first call repeats the
# glob lookup for comparison)
dfm_lookup(my_dfm, my_dict, valuetype = "glob")
dfm_lookup(my_dfm, my_dict, valuetype = "regex", case_insensitive = TRUE)

# fixed format: no pattern matching, values must match features exactly;
# again shown with and without ignoring the case of the dictionary values
dfm_lookup(my_dfm, my_dict, valuetype = "fixed")
dfm_lookup(my_dfm, my_dict, valuetype = "fixed", case_insensitive = FALSE)

# show unmatched tokens: nomatch names an extra feature that holds the counts
# of features not matched to any dictionary key
dfm_lookup(my_dfm, my_dict, nomatch = "_UNMATCHED")

}
\seealso{
\code{\link{dfm_replace}}
}
\keyword{dfm}
