## ----include = FALSE----------------------------------------------------------
# Global knitr chunk options for the vignette: collapse source and output,
# silence warnings/messages, and prefix printed output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
library(sentixr)

## -----------------------------------------------------------------------------
# Load and print the example review data set.
data(recensioni_tv)
recensioni_tv

## -----------------------------------------------------------------------------
library(tidytext)

## -----------------------------------------------------------------------------
# Get the MAL lexicon (inflected forms)
mal_dict <- get_sentix("MAL")
head(mal_dict)

## -----------------------------------------------------------------------------
# Tokenize
tidy_text <- recensioni_tv |>
  unnest_tokens(word, text)

## -----------------------------------------------------------------------------
# Join with lexicon
# NOTE(review): dplyr verbs (left_join, group_by, summarise, mutate, across)
# are used from here on, but dplyr is never attached in this script --
# presumably an attached package provides them; confirm.
tidy_sent <- tidy_text |>
  left_join(mal_dict, by = "word")

head(tidy_sent)

## -----------------------------------------------------------------------------
# Calculate average sentiment per document
sentix_summarize(tidy_sent, simplify = FALSE)

## -----------------------------------------------------------------------------
# Manual summary with dplyr
tidy_sent |>
  group_by(doc_id) |>
  summarise(
    sentiment = mean(score, na.rm = TRUE),
    n_tokens = n(),
    # Tokens that matched a lexicon entry (non-missing score after the join).
    n_scored = sum(!is.na(score))
  )

## -----------------------------------------------------------------------------
# Get MAL with polarity labels
polar_dict <- get_sentix("MAL", polarity = TRUE)
head(polar_dict)

## -----------------------------------------------------------------------------
# Join with tokenized text
tidy_text |>
  left_join(polar_dict, by = "word") |>
  head()

## -----------------------------------------------------------------------------
# Derive polarity from the numeric score with an explicit threshold.
mal_dict |>
  mutate(polarity = make_polarity(score, threshold = 0.125)) |>
  head()

## -----------------------------------------------------------------------------
# Apply make_polarity() to every numeric column of the ELITA lexicon.
get_elita() |>
  mutate(across(where(is.numeric), ~ make_polarity(.x))) |>
  tail()

## -----------------------------------------------------------------------------
library(quanteda)

## ----data---------------------------------------------------------------------
# Build a quanteda tokens object from the reviews, dropping punctuation.
data(recensioni_tv)
sentix_toks <- corpus(recensioni_tv) |>
  tokens(remove_punct = TRUE)

## -----------------------------------------------------------------------------
# Convert MAL to a valence dictionary
my_dict <- df_to_dict(mal_dict)

## ----eval=FALSE---------------------------------------------------------------
# df_to_valence(MAL)

## ----eval = FALSE-------------------------------------------------------------
# # Compute valence
# quanteda.sentiment::textstat_valence(sentix_toks, dictionary = my_dict)
# #>   doc_id  sentiment
# #> 1   doc1  0.2689482
# #> 2   doc2 -0.1755017
# #> 3   doc3  0.2788701
# #> 4   doc4  0.1295423
# #> 5   doc5 -0.0208181

## ----eval=FALSE---------------------------------------------------------------
# my_dict2 <- get_sentix("MAL", polarity = TRUE) |>
#   # if there are other numeric columns, other than 'polarity'
#   df_to_polar()

## -----------------------------------------------------------------------------
my_dict2 <- df_to_dict(polar_dict)

## ----eval = FALSE-------------------------------------------------------------
# # Compute polarity scores
# quanteda.sentiment::textstat_polarity(sentix_toks,
#                                       dictionary = my_dict2)
# #>   doc_id sentiment
# #> 1   doc1 2.8332133
# #> 2   doc2 0.0000000
# #> 3   doc3 1.4663371
# #> 4   doc4 0.9555114
# #> 5   doc5 0.0000000