8 Explorations with LDA

8.1 Initialisation

library(tidyverse)
library(tidytext)
library(topicmodels)
library(pins)
library(pinsqs)
library(AzureStor)
library(ggiraph)

source(here::here("R", "azure_init.R"))

fs::dir_ls(here::here("inst", "NMFregress"), glob="*.R") %>%
  walk(source)

theme_set(theme_bw())
set_girafe_defaults(
  opts_toolbar = opts_toolbar(position="topright", saveaspng=FALSE),
  opts_zoom = opts_zoom(min=0.8, max=4)
)

speeches_board <- storage_endpoint("https://cbspeeches1.dfs.core.windows.net/", token=token) %>%
  storage_container(name = "cbspeeches") %>%
  board_azure(path = "data-speeches")

models_board <- storage_endpoint("https://cbspeeches1.dfs.core.windows.net/", token=token) %>%
  storage_container(name = "cbspeeches") %>%
  board_azure(path = "data-models")

8.2 Model fitting

As a proof of concept, the LDA model will fix the values for the hyperparameters:

  • K, the number of topics, is set to 40.
  • alpha, the homogeneity of document-topic distributions, is set to 1/K=1/401/K \,=\, 1/40.
  • delta, the homogeneity of topic-term distributions, is set to 0.1.
speeches <- speeches_board %>%
  pin_qread("speeches-g7-dtm")

gibbs_control <- list(
  seed = 100,
  verbose = 1L,
  alpha = 1/40,
  delta = 0.1,
  iter = 500
)

lda_model <- LDA(
  speeches,
  k = 40,
  method = "Gibbs",
  control = gibbs_control
)

Creating a checkpoint:

models_board %>%
  pin_qsave(
    lda_model,
    "lda-g7-k=40",
    title = "40-topic lda, g7"
  )

8.3 Explorations

speeches_metadata <- speeches_board %>%
  pin_qread("speeches-g7-metadata", version="20241207T163741Z-5783f")

8.3.1 Pre-exploration data wrangling

As with NMF, variables that refer to theta pertain to the document-topic matrix. Note that LDA models produced by {topicmodels} refer to this matrix as gamma.

As with NMF, the document-topic distribution is first retrieved from the model and reunited with the document metadata.

dtd <- lda_output %>%
  tidy(matrix = "gamma") %>%
  inner_join(speeches_metadata, by=c("document" = "doc")) %>%
  arrange(date, document, topic)

Next, the document-topic proportions were aggregated and averaged by year-month to produce a monthly time series of topic proportions.

summarised_theta_by_month <- dtd %>%
  mutate(
    year = year(date),
    month = month(date)
  ) %>%
  group_by(country, year, month, topic) %>%
  summarise(avg_probability = mean(gamma, na.rm=TRUE)) %>%
  ungroup() %>%
  unite("date", year, month, sep="-") %>%
  mutate(date = ym(date)) %>%
  drop_na()

For plotting, without assigning names to a topic, a topic's top 10 words can be used instead.

ttd <- lda_output %>%
  tidy(matrix = "beta") %>%
  arrange(topic, desc(beta))

top_words <- ttd %>%
  select(-beta) %>%
  group_split(topic) %>%
  map(~ slice_head(.x, n=10)) %>%
  map(~ pull(.x, term)) %>%
  map(str_flatten_comma) %>%
  set_names(1:40)

label_top_words <- top_words %>%
  list_c() %>%
  str_c(as.character(1:40), ": ", .) %>%
  set_names(1:40)

8.3.2 Plots

Hover over plot titles for the top words of each topic. Hover over the series to reveal the country.

data1 <- summarised_theta_by_month %>%
  filter(topic %in% c(21, 26, 34, 37))

plot1 <- ggplot(data1) +
  geom_line_interactive(
    aes(x=date, y=avg_probability, colour=country, tooltip=country, data_id=country),
    alpha=0.4, show.legend=FALSE
  ) +
  facet_wrap_interactive(
    ~ topic, ncol=1,
    labeller = labeller_interactive(aes(tooltip=label_top_words[topic], data_id=topic))
  ) +
  scale_x_date(breaks = "2 years", date_labels="%Y") +
  labs(x="", y="Averaged topic proportions", caption="LDA, G7")

girafe(ggobj = plot1) %>%
  girafe_options(
    opts_hover(css = "stroke-opacity:1;"),
    opts_hover_inv(css = "opacity:0.5;filter:saturate(20%);")
  )
data2 <- summarised_theta_by_month %>%
  filter(topic %in% c(9, 21, 24))

plot2 <- ggplot(data2) +
  geom_line_interactive(
    aes(x=date, y=avg_probability, colour=country, tooltip=country, data_id=country),
    alpha=0.4, show.legend=FALSE
  ) +
  facet_wrap_interactive(
    ~ topic, ncol=1,
    labeller = labeller_interactive(aes(tooltip=label_top_words[topic], data_id=topic))
  ) +
  scale_x_date(breaks = "2 years", date_labels="%Y") +
  labs(x="", y="Averaged topic proportions", caption="LDA, G7")

girafe(ggobj = plot2) %>%
  girafe_options(
    opts_hover(css = "stroke-opacity:1;"),
    opts_hover_inv(css = "opacity:0.5;filter:saturate(20%);")
  )