# Load the packages in ----------------------------------------------------
if(!require(httr)){install.packages("httr"); library(httr)}
if(!require(jsonlite)){install.packages("jsonlite"); library(jsonlite)}
if(!require(withr)){install.packages("withr"); library(withr)}
if(!require(tidyverse)){install.packages("tidyverse"); library(tidyverse)}
if(!require(readxl)){install.packages("readxl"); library(readxl)}
if(!require(patchwork)){install.packages("patchwork"); library(patchwork)}
if(!require(gridExtra)){install.packages("gridExtra"); library(gridExtra)}
if(!require(grid)){install.packages("grid"); library(grid)}
if(!require(cowplot)){install.packages("cowplot"); library(cowplot)}
if(!require(showtext)){install.packages("showtext"); library(showtext)}
if(!require(ggpmisc)){install.packages("ggpmisc"); library(ggpmisc)}
if(!require(ggimage)){install.packages("ggimage"); library(ggimage)}
if(!require(jpeg)){install.packages("jpeg"); library(jpeg)}
if(!require(tinytex)){install.packages("tinytex"); library(tinytex)}
# get the wd
wd <- getwd()
# I have gone beyond the remit of the TidyTuesday and used the whole dataset --------
# No one cares about the later seasons
# Source for using full dataset: https://github.com/toddwschneider/flim-springfield
# Define the metadata URL and fetch it
metadata_url <- "www.kaggle.com/datasets/prashant111/the-simpsons-dataset/croissant/download"
response <- httr::GET(metadata_url)
# Ensure the request succeeded
if (httr::http_status(response)$category != "Success") {
stop("Failed to fetch metadata.")
}
# Parse the metadata
metadata <- httr::content(response, as = "parsed", type = "application/json")
# Locate the ZIP file URL
distribution <- metadata$distribution
zip_url <- NULL
for (file in distribution) {
if (file$encodingFormat == "application/zip") {
zip_url <- file$contentUrl
break
}
}
if (is.null(zip_url)) {
stop("No ZIP file URL found in the metadata.")
}
# Download the ZIP file. We'll use the withr package to make sure the downloaded
# files get cleaned up when we're done.
temp_file <- withr::local_tempfile(fileext = ".zip")
utils::download.file(zip_url, temp_file, mode = "wb")
# Unzip and read the CSV
unzip_dir <- withr::local_tempdir()
utils::unzip(temp_file, exdir = unzip_dir)
# Locate the CSV file within the extracted contents
csv_file <- list.files(unzip_dir, pattern = "\\.csv$", full.names = TRUE)
if (length(csv_file) == 0) {
stop("No CSV file found in the unzipped contents.")
}
# Read the CSV into a dataframe
characters <- read_csv(csv_file[1])
episodes <- read_csv(csv_file[2])
locations <- read_csv(csv_file[3])
script_lines <- read_csv(csv_file[4])
# Load the font and define the theme --------------------------------------
#| warning: false
#| echo: FALSE
#| message: false
font_add_google("Permanent Marker")
showtext_auto()
# Custom Theme - to emulate the simpson colour scheme of yellow and blue
custom_theme <- function() {
ggplot2::theme(
plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = element_text(size = 60, face = "bold", family = "Permanent Marker", color = "#FFD90F", hjust = 0.5),
plot.subtitle = element_text(size = 48, face = "bold", family = "Permanent Marker", color = "#FFD90F"),
plot.caption = element_text(size = 13, family = "Permanent Marker", color = "#FFD90F"),
axis.text = element_text(family = "Permanent Marker", size = 13, color = "#FFD90F"),
axis.title.x = element_text(size = 32, family = "Permanent Marker", color = "#FFD90F"),
axis.title.y = element_text(size = 32, family = "Permanent Marker", color = "#FFD90F"),
axis.line = element_line(linewidth = 0.5, colour = "darkgrey"),
axis.text.x = element_text(angle = 45, hjust = 1, size = 20, family = "Permanent Marker", color = "#FFD90F"),
axis.text.y = element_text(angle = 45, hjust = 1, size = 20, family = "Permanent Marker", color = "#FFD90F"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = alpha("#009DDC", 0.3),
color = "#009DDC",
linewidth = 0.5,
linetype = "solid"),
plot.background = element_rect(fill = alpha("#009DDC", 1),
color = "#009DDC",
linewidth = 0.5,
linetype = "solid"),
legend.background = element_rect(fill = alpha("#009DDC", 0.5),
color = "#009DDC",
linewidth = 0.5,
linetype = "solid"),
legend.title = element_text(family = "Permanent Marker", size = 32, color = "#FFD90F"),
legend.text = element_text(family = "Permanent Marker", size = 32, color = "#FFD90F"),
legend.position = "bottom")
}
# Character Gender Split in the Simpsons --------------------------------------------
gender <- characters %>%
select(gender) %>%
filter(!is.na(gender)) %>%
mutate(gender = str_to_upper(factor(gender)),
gender = case_when(gender == "F" ~ "Female",
gender == "M" ~ "Male",
TRUE ~ NA))
barchart_gender <- gender %>%
ggplot(aes(x = gender)) +
geom_bar(aes(fill = gender)) +
geom_text(stat = "count", aes(label = after_stat(paste0("n = ", count))),
vjust = 2, size = 15,
family = "Permanent Marker",
color = "#FFD90F") +
scale_fill_brewer(palette = "Set2") + # Use a nice color scheme
custom_theme() +
labs(subtitle = "Character Gender Split", y = "Number", fill = "Gender")
# Heatmap of IMDB Ratings -------------------------------------------------
heat <- episodes %>%
select(season, number_in_season, imdb_rating, title) %>%
mutate(season = factor(season),
number_in_season = factor(number_in_season))
# Build a table
max_rating <- max(heat$imdb_rating, na.rm = TRUE)
min_rating <- min(heat$imdb_rating, na.rm = TRUE)
low_rating <- heat %>%
filter(imdb_rating == min_rating)
high_rating <- heat %>%
filter(imdb_rating == max_rating)
highlight_episodes <- heat %>%
filter(imdb_rating == max_rating | imdb_rating == min_rating) %>%
arrange(desc(imdb_rating))
highlight_table <- data.frame(
`Episode Name` = highlight_episodes$title,
`Season` = highlight_episodes$season,
`IMDB Rating` = round(highlight_episodes$imdb_rating, 1),
check.names=FALSE
)
#Only way I could work out how to do this to add a title to the table above
title_table <- data.frame(matrix(ncol = 1, nrow = 1))
colnames(title_table) = "Highest and Lowest Rated Episodes"
title_table[is.na(title_table)] <- ""
heatmap_episode <- ggplot(heat, aes(x = season, y = number_in_season, fill = imdb_rating)) +
geom_tile() +
geom_text(aes(label = round(imdb_rating, 1)),
color = alpha("Black", 0.5),
size = 8,
family = "Permanent Marker") +
scale_fill_gradient(low = "firebrick2", high = "yellow", limits = c(4,10)) +
labs(
subtitle = "Heatmap of Episode IMDB Ratings",
x = "Season Number",
y = "Episode Number",
fill = "IMDB Rating",
caption = "Grey indicated data not available") +
custom_theme() +
theme(legend.position = "right") +
geom_tile(data = low_rating,
aes(x = season, y = number_in_season),
color = "darkred",
linewidth = 1,
fill = NA) +
geom_tile(data = high_rating,
aes(x = season, y = number_in_season),
color = "darkgreen",
linewidth = 1,
fill = NA) +
# Add a Table to the graph
annotate(geom = "table", x = 20, y = 30, label = list(highlight_table),
vjust = 1, hjust = 0, family = "Permanent Marker", color = alpha("black", 0.5),
table.theme = ttheme_minimal(title = "Highest and Lowest Rated Episodes",
base_colour = "#FFD90F", base_family = "Permanent Marker", color = "#009DDC",base_size = 20,
core=list(bg_params = list(fill = "#009DDC")),
colhead = list(bg_params = list(fill = "#009DDC")))) +
# Add a table title to the graph (Messy but)
annotate(geom = "table", x = 20.5, y = 33, label = list(title_table),
vjust = 1, hjust = 0, family = "Permanent Marker", color = alpha("black", 0.5),
table.theme = ttheme_minimal(title = "Highest and Lowest Rated Episodes",
base_colour = "#FFD90F", base_family = "Permanent Marker", base_size = 20, color = "#009DDC",
core=list(bg_params = list(fill = "#009DDC")),
colhead = list(bg_params = list(fill = "#009DDC"))))
# Boxplot of viewership over time -----------------------------------------
viewers <- episodes %>%
select(season, us_viewers_in_millions) %>%
group_by(season) %>%
summarise(mean = mean(us_viewers_in_millions),
sd = sd(us_viewers_in_millions),
n = n(),
se = sd/sqrt(n),
lci = mean - qt(1 - (0.05 / 2), n - 1) * se,
uci = mean + qt(1 - (0.05 / 2), n - 1) * se)
boxplot_viewers <- ggplot(episodes, aes(x = factor(season), y = us_viewers_in_millions)) +
geom_boxplot(fill = alpha("firebrick2", 0.9), color = "black", outlier.shape = NA) +
geom_smooth(method = "lm", se=FALSE, color= "#F2E86D", aes(group=1)) +
labs(
subtitle = "US Viewership per Season",
x = "Season Number",
y = "US Viewers (Millions)",
caption = "Yellow Line: Trendline derived from Linear Model"
) +
custom_theme() +
theme(plot.caption = element_text(hjust = 0))
wd <- getwd()
fig_dir <- paste0(wd, "/1. Data/1. Images/")
lines <-script_lines %>%
filter(speaking_line == T) %>%
rename(Character = raw_character_text) %>%
select(Character, word_count) %>%
group_by(Character) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
mutate(image = case_when(
Character == "Homer Simpson" ~ paste0(fig_dir, "Homer.png"),
Character == "Marge Simpson" ~ paste0(fig_dir, "Marge_Simpson.png"),
Character == "Bart Simpson" ~ paste0(fig_dir, "Bart_Simpson_200px.png"),
Character == "Lisa Simpson" ~ paste0(fig_dir, "Lisa_Simpson.png"),
Character == "C. Montgomery Burns" ~ paste0(fig_dir, "Mr_Burns.png"),
Character == "Moe Szyslak" ~ paste0(fig_dir, "Moe_Szyslak.png"),
Character == "Seymour Skinner" ~ paste0(fig_dir, "Seymour_Skinner.png"),
Character == "Ned Flanders" ~ paste0(fig_dir, "Ned_Flanders.png"),
Character == "Grampa Simpson" ~ paste0(fig_dir, "Abe_Simpson.png"),
Character == "Chief Wiggum" ~ paste0(fig_dir, "Chief_Wiggum.png")))
barchart_lines <- lines %>%
ggplot(aes(x = reorder(Character, n), y = n, fill = Character)) +
geom_bar(stat = "identity") +
geom_image(aes(x = Character, y = -max(n) * 0.05, image = image),
size = 0.04, asp = 1) +
geom_text(aes(label = paste0("n = ", n), y=n/1.5), # looks the best
hjust= -0.5,
position = position_dodge(width = .25),
size = 10,
family = "Permanent Marker",
color = "#FFD90F") +
coord_flip() +
labs(x = "", y = "Number of Lines", subtitle = "Most Spoken Lines per Character") +
custom_theme() +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "none"
)
# Combine all the charts and save them in an outputs folder ---------------
combined <- (barchart_gender + boxplot_viewers + barchart_lines) / heatmap_episode +
plot_annotation(title = 'The Simpsons',
caption = "TidyTuesday: Week 5, 2025",
theme = custom_theme()) +
theme(caption = element_text(hjust = 0.5))