3 Cleaning text for G7 countries

This chapter documents how the text of speeches from G7 countries was cleaned.

3.1 Initialisation

library(tidyverse)
library(pins)
library(pinsqs)
library(AzureStor)

# Run the Azure initialisation script; it presumably defines `token`,
# which is used for authentication below -- confirm in R/azure_init.R.
source(here::here("R", "azure_init.R"))

# Build the pin board step by step. local() keeps the intermediate
# endpoint/container objects out of the session's workspace.
speeches_board <- local({
  endpoint <- storage_endpoint(
    "https://cbspeeches1.dfs.core.windows.net/",
    token = token
  )
  container <- storage_container(endpoint, name = "cbspeeches")
  board_azure(container, path = "data-speeches")
})

3.2 Filter speeches to G7 countries

# Country names used to restrict the corpus; they must match the values
# stored in the `country` column exactly.
g7_members <- c(
  "Canada",
  "France",
  "Germany",
  "Italy",
  "Japan",
  "United Kingdom",
  "United States"
)

# Read the country-tagged speeches from the board and keep G7 rows only.
speeches <- pin_qread(speeches_board, "speeches-with-country") %>%
  filter(country %in% g7_members)

3.3 Fix one date

One speech (document r240109a) was dated December 2024. Because this corpus only goes up to January 2024, its date was corrected to 8 December 2023.

# One-row correction table: document id and the date it should carry.
data_update <- tibble(
  doc = "r240109a",
  date = ymd("2023-12-08")
)

# Overwrite the date of the row whose `doc` matches; other columns and
# rows are left as they are.
speeches <- rows_update(speeches, data_update, by = "doc")

3.4 Repairs and removals

3.4.1 Remove introductions

The introductory remarks of each speech were removed using the same pattern previously used to identify the first sentence of each speech.

# Drop everything from the start of the text up to and including the
# first period, then collapse the whitespace left behind.
speeches <- speeches %>%
  mutate(
    text = text %>%
      str_remove(pattern = "^[^.]+\\.") %>%
      str_squish()
  )

The "Introduction" headers were also removed. A header is identified as the word "Introduction" followed by a space and a word beginning with a capital letter; only the first such occurrence in each speech is removed.

# Remove the first "Introduction " that is directly followed by an
# upper-case letter. The lookahead leaves that letter in place.
speeches <- mutate(
  speeches,
  text = str_remove(text, "Introduction (?=[:upper:])")
)

3.4.2 Remove references section

# Cut the trailing references section. Both patterns run to the end of
# the text, so everything after the matched header is discarded.
speeches <- speeches %>%
  mutate(
    text = text %>%
      # "References" / "References:" right after punctuation or a digit
      str_remove_all("(?<=[:punct:]|[:digit:]) References:? .+$") %>%
      # "References " followed by a capitalised word
      str_remove_all("References (?=[:upper:]).+$")
  )

3.4.3 Repair typos

As mentioned in the section on normalising institution names, some country names were incorrectly entered and require repair. Of the G7 countries, Italy was the only one affected.

# Correct the misspelt country name wherever it occurs in the text.
speeches <- mutate(
  speeches,
  text = str_replace_all(text, "Italty", "Italy")
)

3.4.4 Remove own institution and country names

It is of greater interest when a central bank mentions another central bank or another country. Therefore, all self-mentions of the bank, country, and inhabitants were removed. For example, for Canada, words to remove would include: Bank of Canada, Canada, Canada's, and Canadian. The removal patterns corresponding to each bank are stored in inst/data-misc/bank_country_regex_patterns.csv.

The bank_country_regex_patterns.csv file is read in using read_delim() rather than read_csv() as the file contains pre-escaped regex strings, and as such, escape_backslash = TRUE is required.

# Load the self-mention patterns. escape_backslash = TRUE is needed
# because the file stores pre-escaped regex strings.
bank_country_regex_patterns <-
  here::here("inst", "data-misc", "bank_country_regex_patterns.csv") %>%
  read_delim(delim = ",", escape_backslash = TRUE) %>%
  filter(country %in% g7_members) %>%
  select(country, regex_pattern)

# Attach each speech's country pattern, strip every match from the text,
# and drop the helper column again in the same mutate().
# NOTE(review): this assumes every G7 country has exactly one pattern
# row; a country without one would presumably end up with NA text, and
# duplicate rows would duplicate speeches -- confirm against the CSV.
speeches <- speeches %>%
  left_join(bank_country_regex_patterns, by = "country") %>%
  mutate(
    text = str_remove_all(text, regex_pattern),
    regex_pattern = NULL
  )

3.5 General cleaning

3.5.2 Normalisation of select ngrams into acronyms

"Central Bank Digital Currency" is a particular 4-gram of interest and can be converted to its abbreviated form.

# Replace the phrase with its acronym; (?i) makes the match
# case-insensitive. The plural "Currencies" is not matched by this
# pattern.
speeches <- mutate(
  speeches,
  text = str_replace_all(text, "(?i)Central Bank Digital Currency", "CBDC")
)

3.5.3 Remove non-ascii characters, emails, social media handles, and links

# Apply the removal patterns one after another, in the order listed;
# reduce() threads the text column through each str_remove_all() call.
speeches <- speeches %>%
  mutate(
    text = reduce(
      c(
        # any non-ASCII character
        "[:^ascii:]",
        # e-mail addresses and @handles
        "([[:alnum:]_.\\-]+)?@[[:alnum:]_.\\-]+",
        # links with an explicit scheme
        "https?://\\S+",
        # links starting with www.
        "www\\.\\S+"
      ),
      str_remove_all,
      .init = text
    )
  )

3.5.4 Remove/replace stray/excessive punctuation

# Strip or normalise stray punctuation. The steps run in the order
# written, so each one operates on the output of the previous step.
speeches <- speeches %>%
  mutate(
    # runs of "* "
    text = str_remove_all(text, "(\\* )+"),
    # turn "?" and "!" into plain periods
    text = str_replace_all(text, "\\?|!", "."),
    text = str_remove_all(text, ","),
    text = str_remove_all(text, "\""),
    # collapse repeated apostrophes, then drop every apostrophe that is
    # not sitting between two word characters
    text = str_replace_all(text, "'{2,}", "'"),
    text = str_remove_all(text, "\\B'(?=[:alpha:])"),
    text = str_remove_all(text, "(?<=[:alpha:])'\\B"),
    text = str_remove_all(text, "\\B'\\B"),
    # collapse any run of two or more periods into one; this covers
    # ellipses as well as ".." left by "?!" / "!!" after the step above
    # (same {2,} rule as used for apostrophes)
    text = str_replace_all(text, "\\.{2,}", "."),
    # drop free-standing periods (space on both sides). Only the leading
    # space and the period are removed; the trailing space is kept via
    # the lookahead so the neighbouring words are not fused together.
    text = str_remove_all(text, " \\.(?= )"),
    # hyphens and underscores are removed outright, so hyphenated
    # compounds become a single token
    text = str_remove_all(text, "-"),
    text = str_remove_all(text, "_"),
    # brackets, pipes, semicolons, colons and plus signs
    text = str_remove_all(text, "\\(|\\)|\\{|\\}|\\[|\\]|\\||;|:|\\+")
  )

3.5.5 Remove numerical quantities

This included dollar signs, percent signs, punctuation-separated numbers (e.g. 1.5), and whole numbers.

# Remove currency/percent symbols and numbers. The patterns are applied
# in the order listed, each to the result of the previous one.
speeches <- speeches %>%
  mutate(
    text = reduce(
      c(
        # dollar signs
        "\\$",
        # percent signs
        "%",
        # digit runs, optionally joined by "." or ","
        "[:digit:]+([.,]+[:digit:]+)*",
        # any digit still left
        "[:digit:]"
      ),
      str_remove_all,
      .init = text
    )
  )

3.5.6 Remove stray letters

# Delete single ASCII letters that stand alone between word boundaries.
speeches <- mutate(
  speeches,
  text = str_remove_all(text, "\\b[A-Za-z]\\b")
)

3.5.7 Final squish

Excessive whitespace resulting from previous removals/replacements was removed.

# Trim the ends and collapse internal whitespace runs to single spaces.
speeches <- mutate(speeches, text = str_squish(text))

3.5.8 Remove unneeded columns

# The first_sentence column is no longer needed; keep everything else.
speeches <- select(speeches, -first_sentence)

3.6 Save the data

Writing the data to the pin board:

# Store the cleaned G7 speeches on the board under "speeches-g7-cleaned".
pin_qsave(
  speeches_board,
  speeches,
  "speeches-g7-cleaned",
  title = "speeches for g7 countries, cleaned"
)

Make a separate copy of the metadata as well:

# Metadata-only view of the cleaned speeches: identifier, date,
# institution and country.
speeches_metadata <- select(speeches, doc, date, institution, country)

# Store the metadata on the board as its own pin.
pin_qsave(
  speeches_board,
  speeches_metadata,
  "speeches-g7-metadata",
  title = "metadata for g7 speeches"
)

2 Identifying the country from the text

4 Identifying important bigrams and trigrams