% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sniff_groups_terms.R
\name{sniff_groups_terms}
\alias{sniff_groups_terms}
\title{Extract and Analyze Key Terms from Research Groups}
\usage{
sniff_groups_terms(
  net_groups,
  algorithm = "rake",
  phrase_pattern = "(A|N)*N(P+D*(A|N)*N)*",
  model_dir = tempdir(),
  n_cores = 1,
  show_progress = TRUE,
  n_terms = 15,
  min_freq = 2,
  digits = 4
)
}
\arguments{
\item{net_groups}{A list containing network data with publication information.
Must include elements: \code{network} (with vertex attributes 'group', 'TI', 'AB'),
\code{pubs_by_year}, and \code{aggregate}.}

\item{algorithm}{Term extraction algorithm to use. Options are:
\itemize{
\item "rake" - Rapid Automatic Keyword Extraction (default)
\item "pointwise" - Pointwise Mutual Information
\item "phrase" - Phrase pattern matching
}}

\item{phrase_pattern}{Regular expression pattern for phrase extraction when
\code{algorithm = "phrase"} (default: \code{"(A|N)*N(P+D*(A|N)*N)*"})}

\item{model_dir}{Directory where UDPipe models are stored (default: tempdir())}

\item{n_cores}{Number of CPU cores to use for parallel processing (default: 1)}

\item{show_progress}{Logical indicating whether to show progress bar (default: TRUE)}

\item{n_terms}{Number of top terms to return in summary table (default: 15)}

\item{min_freq}{Minimum frequency threshold for terms (default: 2)}

\item{digits}{Number of decimal places to round numerical values (default: 4)}
}
\value{
A list with two components:
\itemize{
\item \code{terms_by_group}: A named list (by group) of data frames containing extracted terms with statistics
\item \code{terms_table}: A summary tibble with top terms by frequency and TF-IDF for each group
}
}
\description{
Identifies and extracts key terms from titles and abstracts of publications within different
research groups using natural language processing techniques, and computes term statistics
including TF-IDF scores.
}
\details{
This function performs the following steps:
\enumerate{
\item Validates input structure and parameters
\item Loads the UDPipe language model from the specified directory
\item Processes text data (titles and abstracts) for each group
\item Applies the selected term extraction algorithm (RAKE, PMI, or phrase patterns)
\item Computes term frequencies and TF-IDF scores
\item Returns ranked terms for each research group with comprehensive statistics
}

The function uses UDPipe for tokenization, lemmatization and POS tagging before term extraction.
For phrase extraction, the default pattern finds noun phrases.
}
\examples{
\dontrun{
# Assuming groups is output from sniff_groups()
terms <- sniff_groups_terms(groups, algorithm = "rake")

# View terms for first group
head(terms$terms_by_group[[1]])

# View summary table
print(terms$terms_table)

# Customized extraction with custom model directory
net_groups_terms <- sniff_groups_terms(net_groups,
  algorithm = "phrase",
  model_dir = tempdir(),
  n_terms = 10,
  min_freq = 3,
  n_cores = 4
)
}

}
