TidyTuesday Week 12: Amazon’s Annual Reports

This week we’re exploring text data from Amazon’s annual reports. The PDFs were read into R using the {pdftools} R package, and explored by TidyTuesday participant Gregory Vander Vinne in a post on his website. Note that stop words (e.g., ‘and’, ‘the’, ‘a’) have been removed from the data.

TidyTuesday

Data Visualization

R Programming

2025

Author

Peter Gray

Published

March 28, 2025

1. R code

Show code

# Load the packages in ----------------------------------------------------

if(!require(tidyverse)){install.packages("tidyverse"); library(tidyverse)}
if(!require(patchwork)){install.packages("patchwork"); library(patchwork)}
if(!require(ggplot2)){install.packages("ggplot2"); library(ggplot2)}
if(!require(ggwordcloud)){install.packages("ggwordcloud"); library(ggwordcloud)}
# I keep all my styling in a custom package to tidy up my code and keep it consistent over time
if(!require(CustomGGPlot2Theme)){devtools::install("CustomGGPlot2Theme"); library(CustomGGPlot2Theme)}

# Record the working directory the script is run from.
wd <- getwd()

# Font family name applied to the facet strip labels of the word cloud.
font <- "noto_mono"

# Cleaned report words for TidyTuesday 2025-03-25, read straight from the
# TidyTuesday GitHub repository (requires network access).
amazon <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-03-25/report_words_clean.csv"
)


# Build the word-cloud input: the 15 most frequent words per report year,
# for reports from 2013 onwards.
# Rows are counted per (year, word), so each row of `amazon` is treated as one
# occurrence of a word -- assumes the CSV has `year` and `word` columns with
# one row per occurrence; confirm against the data dictionary.
data_clean <- amazon %>%
  filter(year >= 2013) %>%
  # Strip markup-like fragments before counting so that variants of the same
  # word are tallied together instead of as separate entries.
  mutate(word = str_remove_all(word, "<.*?>")) %>%
  # Drop the "aaa" token before ranking; removing it after the top-15 cut
  # would leave a year with only 14 words whenever "aaa" made the cut.
  filter(word != "aaa") %>%
  count(year, word, name = "count") %>%
  arrange(year, desc(count)) %>% # Sort within each year by count
  group_by(year) %>%
  slice_head(n = 15) %>% # Keep the top 15 words per year
  ungroup()




# Faceted word cloud: one panel per year, with both text size and colour
# driven by how often the word occurs (blue = fewer, red = more).
wordcloud <- ggplot(
  data = data_clean,
  mapping = aes(label = word, size = count, color = count)
) +
  geom_text_wordcloud(area_corr = TRUE) +
  facet_wrap(facets = vars(year)) +
  scale_size_area(max_size = 15) +
  scale_color_gradient(low = "blue", high = "red") +
  # Shared house style from the CustomGGPlot2Theme package; the theme() call
  # below must come after it so the strip overrides are not replaced.
  Custom_Style() +
  theme(
    # Strip box filled and outlined in the same cream colour.
    strip.background = element_rect(fill = "#FFFBF0", color = "#FFFBF0"),
    # Large, bold, left-aligned year labels.
    strip.text = element_text(
      family = font,
      face = "bold",
      color = "black",
      size = 32,
      hjust = 0
    )
  )

# Wrap the word cloud in a patchwork annotation to add the title and subtitle.
# `+` binds tighter than `&`, so the annotation is attached first and the
# trailing theme() is then applied across the whole composition.
p1 <- wordcloud +
  plot_annotation(
    # Wrap the long title at 40 characters so it fits the square canvas.
    title = str_wrap("Word Cloud of Top 15 Most Commonly used words in Amazon's Annual Report (2013-2023)", 40),
    subtitle = "TidyTuesday: Week 12, 2025",
    theme = Custom_Style()
  ) &
  theme(
    # `plot.caption` is the ggplot2 theme element for captions; the bare name
    # `caption` is not part of the element hierarchy and has no effect.
    plot.caption = element_text(hjust = 0.5),
    plot.subtitle = element_text(size = 20),
    plot.title = element_text(size = 32)
  )

# Save the finished figure as a square 1080 x 1080 px PNG thumbnail.
# ggsave() is given inches, so each side is 1080 px / 96 dpi = 11.25 in.
ggsave(
  filename = "~/Documents/Coding/Website/data_visualisations/TidyTuesday/2025/thumbnails/TidyTues_Week12.png",
  plot = p1,
  height = 1080 / 96, # 1080 px expressed in inches at 96 DPI
  width = 1080 / 96,  # 1080 px expressed in inches at 96 DPI
  dpi = 96,           # 96 DPI so the inch sizes map back to 1080 px
  units = "in"
)