% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/similarity.R
\docType{methods}
\name{similarity}
\alias{as.matrix.similMatrix}
\alias{print.similMatrix}
\alias{similarity}
\alias{similarity,dfm-method}
\title{Compute similarities between documents and/or features}
\usage{
similarity(x, selection = NULL, n = NULL, margin = c("features",
  "documents"), method = "correlation", sorted = TRUE, normalize = FALSE)

\S4method{similarity}{dfm}(x, selection = NULL, n = NULL,
  margin = c("features", "documents"), method = "correlation",
  sorted = TRUE, normalize = FALSE)

\method{as.matrix}{similMatrix}(x, ...)

\method{print}{similMatrix}(x, digits = 4, ...)
}
\arguments{
\item{x}{a \code{\link{dfm}} object}

\item{selection}{character or character vector of document names or feature 
labels from the dfm}

\item{n}{the top \code{n} most similar items will be returned, sorted in 
descending order.  If \code{n} is \code{NULL}, all items are returned.}

\item{margin}{identifies the margin of the dfm on which similarity will be 
computed: \code{"features"} for word/term features or \code{"documents"} for 
documents.}

\item{method}{a valid method for computing similarity, as listed in 
\code{\link[proxy]{pr_DB}}}

\item{sorted}{sort results in descending order if \code{TRUE}}

\item{normalize}{if \code{TRUE}, normalize the dfm by term frequency within 
document (so that the dfm values will be relative term frequency within 
each document)}

\item{...}{unused}

\item{digits}{decimal places to display similarity values}
}
\value{
a list named by the selection labels, where each element is a named 
  vector of similarity measures (sorted in descending order if 
  \code{sorted = TRUE}).
}
\description{
Compute similarities between documents and/or features from a 
\code{\link{dfm}}. Uses the similarity measures defined in 
\code{\link[proxy]{simil}}.  See \code{\link[proxy]{pr_DB}} for the available 
measures, or for how to create your own.
}
\note{
The method for computing feature similarities can be quite slow when
  there are large numbers of feature types.  Future implementations will
  hopefully speed this up.
}
\examples{
# Build a dfm from the inaugural addresses given after 1980 (Reagan onwards),
# ignoring English stopwords and stemming the remaining features.
presDfm <- dfm(subset(inaugCorpus, Year > 1980), ignoredFeatures = stopwords("english"),
               stem = TRUE)

# Compute document similarities with no selection and the default method
# (correlation). The outer parentheses print the result as well as assigning it.
(tmp <- similarity(presDfm, margin = "documents"))
# Output the same result as a matrix.
as.matrix(tmp)
# Comparisons for specific documents:
# the 5 items most similar to one selected document
similarity(presDfm, "1985-Reagan", n = 5, margin = "documents")
# the 5 items most similar to each of two selected documents
similarity(presDfm, c("2009-Obama" , "2013-Obama"), n = 5, margin = "documents")
# n is left as NULL, so all items are returned
similarity(presDfm, c("2009-Obama" , "2013-Obama"), margin = "documents")
# same selection, using the cosine method instead of the default
similarity(presDfm, c("2009-Obama" , "2013-Obama"), margin = "documents", method = "cosine")
# eJaccard method, with the results left unsorted
similarity(presDfm, "2005-Bush", margin = "documents", method = "eJaccard", sorted = FALSE)

# Compute some term similarities. margin is not supplied here.
# NOTE(review): presumably the first default, features, is used -- confirm.
similarity(presDfm, c("fair", "health", "terror"), method="cosine")

\dontrun{
# Compare against the tm package (this section is marked dontrun).
require(tm)
data("crude")
# Preprocess the crude corpus: lower-case, strip punctuation and numbers, stem.
crude <- tm_map(crude, content_transformer(tolower))
crude <- tm_map(crude, removePunctuation)
crude <- tm_map(crude, removeNumbers)
crude <- tm_map(crude, stemDocument)
tdm <- TermDocumentMatrix(crude)
# Term associations from tm for three terms.
# NOTE(review): the numeric vector looks like a per-term lower correlation
# limit -- confirm against the tm documentation for findAssocs.
findAssocs(tdm, c("oil", "opec", "xyz"), c(0.75, 0.82, 0.1))
# The same data in quanteda: transpose the term-document matrix and wrap the
# result in a dfmSparse object.
quantedaDfm <- new("dfmSparse", Matrix(t(as.matrix(tdm))))
# Top 14 most similar items for each term, with default margin and method.
similarity(quantedaDfm, c("oil", "opec", "xyz"), n = 14)
# Cross-check by calling proxy::simil directly on the columns (by_rows = FALSE)
# and listing the largest values for oil (top 14) and opec (top 9), rounded to
# two decimal places.
corMat <- as.matrix(proxy::simil(as.matrix(quantedaDfm), by_rows = FALSE))
round(head(sort(corMat[, "oil"], decreasing = TRUE), 14), 2)
round(head(sort(corMat[, "opec"], decreasing = TRUE), 9), 2)
}
}

