% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tokens_compound.R
\name{tokens_compound}
\alias{tokens_compound}
\title{Convert token sequences into compound tokens}
\usage{
tokens_compound(x, pattern, concatenator = "_", valuetype = c("glob",
  "regex", "fixed"), case_insensitive = TRUE, join = TRUE)
}
\arguments{
\item{x}{an input \link{tokens} object}

\item{pattern}{a character vector, list of character vectors, \link{dictionary},
\link{collocations}, or \link{dfm}. See \link{pattern} for details.}

\item{concatenator}{the concatenation character that will connect the words
making up the multi-word sequences. The default \code{_} is recommended,
since it will not be removed during normal cleaning and tokenization (while
nearly all other punctuation characters, at least those in the Unicode
punctuation class [P], will be removed).}

\item{valuetype}{the type of pattern matching: \code{"glob"} for 
"glob"-style wildcard expressions; \code{"regex"} for regular expressions;
or \code{"fixed"} for exact matching. See \link{valuetype} for details.}

\item{case_insensitive}{logical; if \code{TRUE}, ignore case when matching}

\item{join}{logical; if \code{TRUE}, join overlapping compounds}
}
\value{
a \link{tokens} object in which the token sequences matching \code{pattern}
have been replaced by  compound "tokens" joined by the concatenator
}
\description{
Replace multi-token sequences with a single multi-word, or "compound", token.
The resulting compound tokens will represent a phrase or multi-word expression,
concatenated with \code{concatenator} (by default, the \code{_} character)
to form a single "token". This ensures that the sequences will be processed
subsequently as single tokens, for instance in constructing a \link{dfm}.
}
\examples{
# two short texts that mention several multi-word tax terms
txts <- c(
    "The new law included a capital gains tax, and an inheritance tax.",
    "New York City has raised taxes: an income tax and inheritance taxes."
)
toks_tax <- tokens(txts, remove_punct = TRUE)

# pattern given as a list of character vectors, one vector per sequence
seqs <- list(
    c("tax"),
    c("income", "tax"),
    c("capital", "gains", "tax"),
    c("inheritance", "tax")
)
toks_seq <- tokens_compound(toks_tax, seqs)
toks_seq
dfm(toks_seq)

# pattern given as a dictionary, such as one used for dfm creation
dict_tax <- dictionary(list(
    tax = c("tax", "income tax", "capital gains tax", "inheritance tax*")
))
toks_dict <- tokens_compound(toks_tax, dict_tax)
toks_dict

# to pick up "taxes" in the second text, match the same dictionary
# with valuetype = "regex"
toks_regex <- tokens_compound(toks_tax, dict_tax, valuetype = "regex")
toks_regex

# dictionary whose values contain the glob wildcards * and ?
dict_sentiment <- dictionary(list(
    negative = c("bad* word*", "negative", "awful text"),
    positive = c("good stuff", "like? th??")
))
txts_sentiment <- c(
    txt1 = "I liked this, when we can use bad words, in awful text.",
    txt2 = "Some damn good stuff, like the text, she likes that too."
)
tokens_compound(tokens(txts_sentiment), dict_sentiment)

# pattern given as a collocations object
toks_coll <- tokens("capital gains taxes are worse than inheritance taxes")
colls <- textstat_collocations(toks_coll, size = 2, min_count = 1)
toks_law <- tokens("The new law included capital gains taxes and inheritance taxes.")
tokens_compound(toks_law, colls)
}
\author{
Kenneth Benoit and Kohei Watanabe
}
