% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ngram.R
\name{step_ngram}
\alias{step_ngram}
\title{Generate n-grams From Token Variables}
\usage{
step_ngram(
  recipe,
  ...,
  role = NA,
  trained = FALSE,
  columns = NULL,
  num_tokens = 3L,
  min_num_tokens = 3L,
  delim = "_",
  skip = FALSE,
  id = rand_id("ngram")
)
}
\arguments{
\item{recipe}{A \link{recipe} object. The step will be added to the
sequence of operations for this recipe.}

\item{...}{One or more selector functions to choose which
variables are affected by the step. See \code{\link[recipes:selections]{recipes::selections()}}
for more details.}

\item{role}{Not used by this step since no new variables are
created.}

\item{trained}{A logical to indicate if the quantities for
preprocessing have been estimated.}

\item{columns}{A character string of variable names that will
be populated (eventually) by the \code{terms} argument. This is \code{NULL}
until the step is trained by \code{\link[recipes:prep]{recipes::prep.recipe()}}.}

\item{num_tokens}{The number of tokens in the n-gram. This must be an integer
greater than or equal to 1. Defaults to 3.}

\item{min_num_tokens}{The minimum number of tokens in the n-gram. This must
be an integer greater than or equal to 1 and smaller than \code{n}. Defaults to
3.}

\item{delim}{The separator between words in an n-gram. Defaults to "_".}

\item{skip}{A logical. Should the step be skipped when the
recipe is baked by \code{\link[recipes:bake]{recipes::bake.recipe()}}? While all operations are baked
when \code{\link[recipes:prep]{recipes::prep.recipe()}} is run, some operations may not be able to be
conducted on new data (e.g. processing the outcome variable(s)).
Care should be taken when using \code{skip = FALSE}.}

\item{id}{A character string that is unique to this step to identify it.}
}
\value{
An updated version of \code{recipe} with the new step added
to the sequence of existing steps (if any).
}
\description{
\code{step_ngram} creates a \emph{specification} of a recipe step that will convert a
\code{\link[=tokenlist]{token}} variable into a \code{\link[=tokenlist]{token}} variable of
ngrams.
}
\details{
The use of this step will leave the ordering of the tokens meaningless. If
\code{min_num_tokens <  num_tokens} then the tokens order in increasing fashion
with respect to the number of tokens in the n-gram. If \code{min_num_tokens = 1}
and \code{num_tokens = 3} then the output contains all the 1-grams followed by all
the 2-grams followed by all the 3-grams.
}
\section{Tidying}{
When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble with columns \code{terms}
(the selectors or variables selected).
}

\section{Case weights}{


The underlying operation does not allow for case weights.
}

\examples{
library(recipes)
library(modeldata)
data(tate_text)

tate_rec <- recipe(~., data = tate_text) \%>\%
  step_tokenize(medium) \%>\%
  step_ngram(medium)

tate_obj <- tate_rec \%>\%
  prep()

bake(tate_obj, new_data = NULL, medium) \%>\%
  slice(1:2)

bake(tate_obj, new_data = NULL) \%>\%
  slice(2) \%>\%
  pull(medium)

tidy(tate_rec, number = 2)
tidy(tate_obj, number = 2)
}
\seealso{
\code{\link[=step_tokenize]{step_tokenize()}} to turn characters into \code{\link[=tokenlist]{tokens}}

Other Steps for Token Modification: 
\code{\link{step_lemma}()},
\code{\link{step_pos_filter}()},
\code{\link{step_stem}()},
\code{\link{step_stopwords}()},
\code{\link{step_tokenfilter}()},
\code{\link{step_tokenmerge}()}
}
\concept{Steps for Token Modification}
