get_diag() is a helper function that computes the average and median
semanticCoherence and exclusivity for a number of stm models.
The function does not work for models with content covariates.
get_diag(models, outobj)
models: A list of stm models.
outobj: The out object containing the documents for all stm models.
Returns model diagnostics in a data frame.
library(stm)
#> Warning: package 'stm' was built under R version 4.2.3
#> stm v1.3.7 successfully loaded. See ?stm for help.
#> Papers, resources, and other materials at structuraltopicmodel.com
library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.2.3
library(quanteda)
#> Warning: package 'quanteda' was built under R version 4.2.3
#> Warning: undefined subclass "ndiMatrix" of class "replValueSp"; definition not updated
#> Warning: undefined subclass "pcorMatrix" of class "replValueSp"; definition not updated
#> Package version: 4.0.2
#> Unicode version: 13.0
#> ICU version: 69.1
#> Parallel computing: 16 of 16 threads used.
#> See https://quanteda.io for tutorials and examples.
# Prepare data: build a quanteda corpus from the gadarian responses and keep
# each response's text as a document variable.
data <- corpus(gadarian, text_field = 'open.ended.response')
docvars(data)$text <- as.character(data)

# Tokenise without punctuation, stem, drop English stopwords, then build a
# document-feature matrix trimmed with min_termfreq = 2.
data <- data |>
  tokens(remove_punct = TRUE) |>
  tokens_wordstem() |>
  tokens_remove(stopwords('english')) |>
  dfm() |>
  dfm_trim(min_termfreq = 2)

# Convert the dfm to the 'stm' format.
out <- convert(data, to = 'stm')

# Fit models: same inputs and prevalence formula, differing only in K.
# max.em.its = 1 reduces computation time for the example.
gadarian_3 <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 3,
  prevalence = ~ treatment + s(pid_rep),
  data = out$meta,
  max.em.its = 1,
  verbose = FALSE
)
gadarian_5 <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 5,
  prevalence = ~ treatment + s(pid_rep),
  data = out$meta,
  max.em.its = 1,
  verbose = FALSE
)

# Get diagnostics for both models; the list names label the models.
diag <- get_diag(
  models = list(model_3 = gadarian_3, model_5 = gadarian_5),
  outobj = out
)

# The plotting code is guarded by if (FALSE), so it is never executed.
if (FALSE) {
  # Plot diagnostics: coherence against exclusivity, coloured by statistic
  # and labelled with the model name.
  ggplot(diag, aes(x = coherence, y = exclusivity, color = statistic)) +
    geom_text(aes(label = name), nudge_x = 5) +
    geom_point() +
    labs(x = 'Semantic Coherence', y = 'Exclusivity') +
    theme_light()
}