% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/S4classes.R, R/phrases.R
\docType{class}
\name{phrases}
\alias{phrases}
\alias{phrases-class}
\alias{as.phrases,ngrams-method}
\alias{as.phrases}
\alias{as.phrases,matrix-method}
\alias{as.character,phrases-method}
\alias{concatenate_phrases}
\title{Manage and use phrases}
\usage{
\S4method{as.phrases}{ngrams}(.Object)

\S4method{as.phrases}{matrix}(.Object, corpus, enc = encoding(corpus))

\S4method{as.character}{phrases}(x, p_attribute)

concatenate_phrases(dt, phrases, col)
}
\arguments{
\item{.Object}{Input object, either a \code{ngrams} or a \code{matrix} object.}

\item{corpus}{A length-one \code{character} vector, the corpus ID of the corpus
from which regions / the \code{data.table} representing a decoded corpus is derived.}

\item{enc}{Encoding of the corpus.}

\item{x}{A \code{phrases} class object.}

\item{p_attribute}{The positional attribute (p-attribute) to decode.}

\item{dt}{A \code{data.table}.}

\item{phrases}{A \code{phrases} class object.}

\item{col}{The column of the \code{data.table} \code{dt} to concatenate.}
}
\description{
Class, methods and functionality for processing phrases (lexical
units, lexical items, multi-word expressions) beyond the token level. The
envisaged workflow at this stage is to detect phrases using the
\code{ngrams}-method and to generate a \code{phrases} class object from the
\code{ngrams} object using the \code{as.phrases} method. This object can be
passed into a call of \code{count}, see examples. Further methods and
functions documented here are used internally, but may be useful.
}
\details{
The \code{phrases} class considers a phrase as a sequence of tokens that can
be defined by a region, i.e. a left and a right corpus position. This
information is kept in a region matrix in the slot "cpos" of the
\code{phrases} class. The \code{phrases} class inherits from the
\code{\link{regions}} class (which inherits from the
\code{\link{corpus}} class), without adding further slots.

If \code{.Object} is an object of class \code{ngrams}, the
\code{as.phrases}-method will interpret the ngrams as CQP queries,
look up the matching corpus positions and return a \code{phrases}
object.

If \code{.Object} is a \code{matrix}, the \code{as.phrases}-method
will initialize a \code{phrases} object. The corpus and the encoding of the
corpus will be assigned to the object.

Applying the \code{as.character}-method on a \code{phrases} object
will return the decoded regions, concatenated using an underscore as
separator.

The \code{concatenate_phrases} function takes a \code{data.table}
(argument \code{dt}) as input and concatenates phrases in successive rows
into a phrase.
}
\examples{
# Workflow to create document-term-matrix with phrases

# Step 1: counts on the p-attribute word; handed to pmi() below via the
# observed argument.
obs <- corpus("GERMAPARLMINI") \%>\%
  count(p_attribute = "word")

# Step 2: bigrams (n = 2L) are scored with pmi(), narrowed down by two
# subset() calls and converted with the as.phrases() method for ngrams objects.
# NOTE(review): subset(1:100) presumably keeps rows 1 to 100 - confirm.
phrases <- corpus("GERMAPARLMINI") \%>\%
  ngrams(n = 2L, p_attribute = "word") \%>\%
  pmi(observed = obs) \%>\% 
  subset(ngram_count > 5L) \%>\%
  subset(1:100) \%>\%
  as.phrases()

# Step 3: the phrases object is passed into count() via the phrases argument
# and the result is cast with as.DocumentTermMatrix() using the count column.
dtm <- corpus("GERMAPARLMINI") \%>\%
  as.speeches(s_attribute_name = "speaker", progress = TRUE) \%>\%
  count(phrases = phrases, p_attribute = "word", progress = TRUE, verbose = TRUE) \%>\%
  as.DocumentTermMatrix(col = "count", verbose = FALSE)
  
# Step 4: look up two phrases among the column names of dtm.
# NOTE(review): presumably phrases appear as underscore-joined terms - confirm.
grep("erneuerbaren_Energien", colnames(dtm))
grep("verpasste_Chancen", colnames(dtm))

# Derive phrases object from an ngrams object

# Bigrams of the REUTERS corpus are scored with pmi() against token counts,
# filtered with two subset() calls and passed to as.phrases() without further
# arguments, i.e. the method for ngrams objects shown in the usage section.
reuters_phrases <- ngrams("REUTERS", p_attribute = "word", n = 2L) \%>\%
  pmi(observed = count("REUTERS", p_attribute = "word")) \%>\%
  subset(ngram_count >= 5L) \%>\%
  subset(1:25) \%>\%
  as.phrases()

# Decode the phrases on the p-attribute word. According to the details
# section, the decoded regions are concatenated using an underscore.
phr <- as.character(reuters_phrases, p_attribute = "word")

# Derive phrases from explicitly stated CQP queries

# Each query consists of two quoted token patterns.
cqp_phrase_queries <- c(
  '"oil" "revenue";',
  '"Sheikh" "Aziz";',
  '"Abdul" "Aziz";',
  '"Saudi" "Arabia";',
  '"oil" "markets";'
)
# The result of cpos() is piped into as.phrases() with corpus and enc stated
# explicitly, matching the signature of the matrix method in the usage section.
# NOTE(review): cpos() presumably returns a region matrix here - confirm.
reuters_phrases <- cpos("REUTERS", cqp_phrase_queries, p_attribute = "word") \%>\%
  as.phrases(corpus = "REUTERS", enc = "latin1")
  
# Use the concatenate_phrases() function on a data.table

# CQP queries for multi-word lexical units; the patterns are regular
# expressions (see the cqp = TRUE argument in the cpos() call below).
lexical_units_cqp <- c(
  '"Deutsche.*" "Bundestag.*";',
  '"sozial.*" "Gerechtigkeit";',
  '"Ausschuss" "f.r" "Arbeit" "und" "Soziales";',
  '"soziale.*" "Marktwirtschaft";',
  '"freiheitliche.*" "Grundordnung";'
)

# The enc argument is the encoding of the corpus, not a p-attribute. It is
# omitted here so that the documented default, encoding(corpus), is used.
phr <- cpos("GERMAPARLMINI", query = lexical_units_cqp, cqp = TRUE) \%>\%
  as.phrases(corpus = "GERMAPARLMINI")

# Decode the corpus to a data.table and concatenate the phrases in the
# column named word.
dt <- corpus("GERMAPARLMINI") \%>\%
  decode(p_attribute = "word", s_attribute = character(), to = "data.table") \%>\%
  concatenate_phrases(phrases = phr, col = "word")

# Rows holding concatenated phrases, with tokens joined by an underscore.
dt[word == "Deutschen_Bundestag"]
dt[word == "soziale_Marktwirtschaft"]

}
\seealso{
Other classes to manage corpora: 
\code{\link{corpus-class}},
\code{\link{ranges-class}},
\code{\link{regions}},
\code{\link{subcorpus}}
}
\concept{classes to manage corpora}
