8 Explorations with LDA
8.1 Initialisation
library(tidyverse)
library(tidytext)
library(topicmodels)
library(pins)
library(pinsqs)
library(AzureStor)
library(ggiraph)
source(here::here("R", "azure_init.R"))
fs::dir_ls(here::here("inst", "NMFregress"), glob="*.R") %>%
walk(source)
theme_set(theme_bw())
set_girafe_defaults(
opts_toolbar = opts_toolbar(position="topright", saveaspng=FALSE),
opts_zoom = opts_zoom(min=0.8, max=4)
)
speeches_board <- storage_endpoint("https://cbspeeches1.dfs.core.windows.net/", token=token) %>%
storage_container(name = "cbspeeches") %>%
board_azure(path = "data-speeches")
models_board <- storage_endpoint("https://cbspeeches1.dfs.core.windows.net/", token=token) %>%
storage_container(name = "cbspeeches") %>%
board_azure(path = "data-models")
8.2 Model fitting
As a proof of concept, the LDA model will fix the values for the hyperparameters:
-
K
, the number of topics, is set to 40. -
alpha
, the homogeneity of document-topic distributions, is set to . -
delta
, the homogeneity of topic-term distributions, is set to 0.1.
speeches <- speeches_board %>%
pin_qread("speeches-g7-dtm")
gibbs_control <- list(
seed = 100,
verbose = 1L,
alpha = 1/40,
delta = 0.1,
iter = 500
)
lda_model <- LDA(
speeches,
k = 40,
method = "Gibbs",
control = gibbs_control
)
Creating a checkpoint:
8.3 Explorations
speeches_metadata <- speeches_board %>%
pin_qread("speeches-g7-metadata", version="20241207T163741Z-5783f")
8.3.1 Pre-exploration data wrangling
As with NMF, variables that refer to theta
pertain to the document-topic matrix. Note that LDA
models produced by {topicmodels} refer to this matrix as gamma
.
As with NMF, the document-topic distribution is first retrieved from the model and reunited with the document metadata.
dtd <- lda_output %>%
tidy(matrix = "gamma") %>%
inner_join(speeches_metadata, by=c("document" = "doc")) %>%
arrange(date, document, topic)
Next, the document-topic proportions were aggregated and averaged by year-month to produce a monthly time series of topic proportions.
summarised_theta_by_month <- dtd %>%
mutate(
year = year(date),
month = month(date)
) %>%
group_by(country, year, month, topic) %>%
summarise(avg_probability = mean(gamma, na.rm=TRUE)) %>%
ungroup() %>%
unite("date", year, month, sep="-") %>%
mutate(date = ym(date)) %>%
drop_na()
For plotting, without assigning names to a topic, a topic's top 10 words can be used instead.
ttd <- lda_output %>%
tidy(matrix = "beta") %>%
arrange(topic, desc(beta))
top_words <- ttd %>%
select(-beta) %>%
group_split(topic) %>%
map(~ slice_head(.x, n=10)) %>%
map(~ pull(.x, term)) %>%
map(str_flatten_comma) %>%
set_names(1:40)
label_top_words <- top_words %>%
list_c() %>%
str_c(as.character(1:40), ": ", .) %>%
set_names(1:40)
8.3.2 Plots
Hover over plot titles for the top words of each topic. Hover over the series to reveal the country.
data1 <- summarised_theta_by_month %>%
filter(topic %in% c(21, 26, 34, 37))
plot1 <- ggplot(data1) +
geom_line_interactive(
aes(x=date, y=avg_probability, colour=country, tooltip=country, data_id=country),
alpha=0.4, show.legend=FALSE
) +
facet_wrap_interactive(
~ topic, ncol=1,
labeller = labeller_interactive(aes(tooltip=label_top_words[topic], data_id=topic))
) +
scale_x_date(breaks = "2 years", date_labels="%Y") +
labs(x="", y="Averaged topic proportions", caption="LDA, G7")
girafe(ggobj = plot1) %>%
girafe_options(
opts_hover(css = "stroke-opacity:1;"),
opts_hover_inv(css = "opacity:0.5;filter:saturate(20%);")
)
data2 <- summarised_theta_by_month %>%
filter(topic %in% c(9, 21, 24))
plot2 <- ggplot(data2) +
geom_line_interactive(
aes(x=date, y=avg_probability, colour=country, tooltip=country, data_id=country),
alpha=0.4, show.legend=FALSE
) +
facet_wrap_interactive(
~ topic, ncol=1,
labeller = labeller_interactive(aes(tooltip=label_top_words[topic], data_id=topic))
) +
scale_x_date(breaks = "2 years", date_labels="%Y") +
labs(x="", y="Averaged topic proportions", caption="LDA, G7")
girafe(ggobj = plot2) %>%
girafe_options(
opts_hover(css = "stroke-opacity:1;"),
opts_hover_inv(css = "opacity:0.5;filter:saturate(20%);")
)