Simpsons Data Analysis

An Analysis of the Simpsons data provided for Week 5 of Tidy Tuesday

TidyTuesday
Data Visualization
R Programming
2025
Author

Peter Gray

Published

February 2, 2025

Thumbnail

1. R code

Show code
# Load the packages in ----------------------------------------------------

if(!require(httr)){install.packages("httr"); library(httr)}
if(!require(jsonlite)){install.packages("jsonlite"); library(jsonlite)}
if(!require(withr)){install.packages("withr"); library(withr)}
if(!require(tidyverse)){install.packages("tidyverse"); library(tidyverse)}
if(!require(readxl)){install.packages("readxl"); library(readxl)}
if(!require(patchwork)){install.packages("patchwork"); library(patchwork)}
if(!require(gridExtra)){install.packages("gridExtra"); library(gridExtra)}
if(!require(grid)){install.packages("grid"); library(grid)}
if(!require(cowplot)){install.packages("cowplot"); library(cowplot)}
if(!require(showtext)){install.packages("showtext"); library(showtext)}
if(!require(ggpmisc)){install.packages("ggpmisc"); library(ggpmisc)}
if(!require(ggimage)){install.packages("ggimage"); library(ggimage)}
if(!require(jpeg)){install.packages("jpeg"); library(jpeg)}
if(!require(tinytex)){install.packages("tinytex"); library(tinytex)}

# get the wd
wd <- getwd()


# I have gone beyond the remit of the TidyTuesday and used the whole dataset --------
# No one cares about the later seasons

# Source for using full dataset: https://github.com/toddwschneider/flim-springfield
# Define the metadata URL and fetch it
metadata_url <- "www.kaggle.com/datasets/prashant111/the-simpsons-dataset/croissant/download"
response <- httr::GET(metadata_url)

# Ensure the request succeeded
if (httr::http_status(response)$category != "Success") {
  stop("Failed to fetch metadata.")
}

# Parse the metadata
metadata <- httr::content(response, as = "parsed", type = "application/json")

# Locate the ZIP file URL
distribution <- metadata$distribution
zip_url <- NULL

for (file in distribution) {
  if (file$encodingFormat == "application/zip") {
    zip_url <- file$contentUrl
    break
  }
}

if (is.null(zip_url)) {
  stop("No ZIP file URL found in the metadata.")
}

# Download the ZIP file. We'll use the withr package to make sure the downloaded
# files get cleaned up when we're done.
temp_file <- withr::local_tempfile(fileext = ".zip")
utils::download.file(zip_url, temp_file, mode = "wb")

# Unzip and read the CSV
unzip_dir <- withr::local_tempdir()
utils::unzip(temp_file, exdir = unzip_dir)

# Locate the CSV file within the extracted contents
csv_file <- list.files(unzip_dir, pattern = "\\.csv$", full.names = TRUE)

if (length(csv_file) == 0) {
  stop("No CSV file found in the unzipped contents.")
}

# Read the CSV into a dataframe
characters <- read_csv(csv_file[1])
episodes <- read_csv(csv_file[2])
locations <- read_csv(csv_file[3])
script_lines <- read_csv(csv_file[4])




# Load the font and define the theme --------------------------------------

#| warning: false
#| echo: FALSE
#| message: false

font_add_google("Permanent Marker")
showtext_auto()


# Custom Theme - to emulate the simpson colour scheme of yellow and blue
custom_theme <- function() {
  ggplot2::theme(
    plot.title.position   = "plot",
    plot.caption.position = "plot",
    plot.title = element_text(size = 60, face = "bold", family = "Permanent Marker", color = "#FFD90F", hjust =  0.5),    
    plot.subtitle = element_text(size = 48, face = "bold", family = "Permanent Marker", color = "#FFD90F"),    
    plot.caption = element_text(size = 13,  family = "Permanent Marker", color = "#FFD90F"),    
    axis.text = element_text(family = "Permanent Marker", size = 13, color = "#FFD90F"),   
    axis.title.x = element_text(size = 32,  family = "Permanent Marker", color = "#FFD90F"), 
    axis.title.y = element_text(size = 32,  family = "Permanent Marker", color = "#FFD90F"), 
    axis.line = element_line(linewidth  = 0.5, colour = "darkgrey"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 20,  family = "Permanent Marker", color = "#FFD90F"),
    axis.text.y = element_text(angle = 45, hjust = 1, size = 20, family = "Permanent Marker", color = "#FFD90F"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(), 
    panel.background = element_rect(fill = alpha("#009DDC", 0.3),  
                                    color = "#009DDC", 
                                    linewidth = 0.5, 
                                    linetype = "solid"),
    plot.background = element_rect(fill = alpha("#009DDC", 1),  
                                   color = "#009DDC", 
                                   linewidth = 0.5, 
                                   linetype = "solid"),
    legend.background = element_rect(fill = alpha("#009DDC", 0.5),  
                                     color = "#009DDC", 
                                     linewidth = 0.5, 
                                     linetype = "solid"),
    legend.title = element_text(family = "Permanent Marker", size = 32, color = "#FFD90F"),
    legend.text = element_text(family = "Permanent Marker", size = 32, color = "#FFD90F"),
    legend.position = "bottom")
}


# Character Gender Split in the Simpsons --------------------------------------------

gender <- characters %>% 
  select(gender) %>% 
  filter(!is.na(gender)) %>% 
  mutate(gender = str_to_upper(factor(gender)),
         gender = case_when(gender == "F" ~ "Female", 
                            gender == "M" ~ "Male",
                            TRUE ~ NA))

barchart_gender <- gender %>% 
  ggplot(aes(x = gender)) +
  geom_bar(aes(fill = gender))  +
  geom_text(stat = "count", aes(label = after_stat(paste0("n = ", count))), 
            vjust = 2, size = 15,  
            family = "Permanent Marker", 
            color = "#FFD90F") +
  scale_fill_brewer(palette = "Set2") +  # Use a nice color scheme
  custom_theme() + 
  labs(subtitle = "Character Gender Split", y = "Number", fill = "Gender")


# Heatmap of IMDB Ratings -------------------------------------------------

heat <- episodes %>% 
  select(season, number_in_season, imdb_rating, title) %>%  
  mutate(season = factor(season),
         number_in_season = factor(number_in_season))
# Build a table

max_rating <- max(heat$imdb_rating, na.rm = TRUE)
min_rating <- min(heat$imdb_rating, na.rm = TRUE)


low_rating <- heat %>%
  filter(imdb_rating == min_rating)
high_rating <- heat %>%
  filter(imdb_rating == max_rating)

highlight_episodes <- heat %>%
  filter(imdb_rating == max_rating | imdb_rating == min_rating) %>%
  arrange(desc(imdb_rating)) 

highlight_table <- data.frame(
  `Episode Name` = highlight_episodes$title,
  `Season` = highlight_episodes$season,
  `IMDB Rating` = round(highlight_episodes$imdb_rating, 1),
  check.names=FALSE
)

#Only way I could work out how to do this to add a title to the table above

title_table <- data.frame(matrix(ncol = 1, nrow = 1))

colnames(title_table) = "Highest and Lowest Rated Episodes"

title_table[is.na(title_table)] <- ""

heatmap_episode <- ggplot(heat, aes(x = season, y = number_in_season, fill = imdb_rating)) +
  geom_tile() +
  geom_text(aes(label = round(imdb_rating, 1)),  
            color = alpha("Black", 0.5),  
            size = 8,  
            family = "Permanent Marker") +
  scale_fill_gradient(low = "firebrick2", high = "yellow", limits = c(4,10)) +  
  labs(
    subtitle = "Heatmap of Episode IMDB Ratings", 
    x = "Season Number",  
    y = "Episode Number", 
    fill = "IMDB Rating",
    caption = "Grey indicated data not available") +
  custom_theme() + 
  theme(legend.position = "right") +
  geom_tile(data = low_rating, 
            aes(x = season, y = number_in_season), 
            color = "darkred",
            linewidth = 1, 
            fill = NA)  + 
  geom_tile(data = high_rating, 
            aes(x = season, y = number_in_season), 
            color = "darkgreen",
            linewidth = 1, 
            fill = NA) +
  # Add a Table to the graph
  annotate(geom = "table", x = 20, y = 30, label = list(highlight_table), 
           vjust = 1, hjust = 0, family = "Permanent Marker", color = alpha("black", 0.5),
           table.theme = ttheme_minimal(title = "Highest and Lowest Rated Episodes",
                                        base_colour = "#FFD90F", base_family = "Permanent Marker", color = "#009DDC",base_size = 20,
                                        core=list(bg_params = list(fill = "#009DDC")), 
                                        colhead = list(bg_params = list(fill = "#009DDC")))) + 
  # Add a table title to the graph (Messy but)
  annotate(geom = "table", x = 20.5, y = 33, label = list(title_table), 
           vjust = 1, hjust = 0, family = "Permanent Marker", color = alpha("black", 0.5),
           table.theme = ttheme_minimal(title = "Highest and Lowest Rated Episodes",
                                        base_colour = "#FFD90F", base_family = "Permanent Marker", base_size = 20, color = "#009DDC",
                                        core=list(bg_params = list(fill = "#009DDC")), 
                                        colhead = list(bg_params = list(fill = "#009DDC"))))




# Boxplot of viewership over time -----------------------------------------


viewers <- episodes %>% 
  select(season, us_viewers_in_millions) %>% 
  group_by(season) %>% 
  summarise(mean = mean(us_viewers_in_millions),
            sd =  sd(us_viewers_in_millions),
            n = n(),
            se = sd/sqrt(n), 
            lci = mean - qt(1 - (0.05 / 2), n - 1) * se, 
            uci = mean + qt(1 - (0.05 / 2), n - 1) * se)

boxplot_viewers <- ggplot(episodes, aes(x = factor(season), y = us_viewers_in_millions)) +
  geom_boxplot(fill = alpha("firebrick2", 0.9), color = "black", outlier.shape = NA) +  
  geom_smooth(method = "lm", se=FALSE, color= "#F2E86D", aes(group=1)) +
  labs(
    subtitle = "US Viewership per Season",
    x = "Season Number",
    y = "US Viewers (Millions)",
    caption = "Yellow Line: Trendline derived from Linear Model"
  ) +
  custom_theme() +
  theme(plot.caption = element_text(hjust = 0))

wd <- getwd()


fig_dir <- paste0(wd, "/1. Data/1. Images/")

lines <-script_lines %>% 
  filter(speaking_line == T) %>% 
  rename(Character = raw_character_text) %>% 
  select(Character, word_count) %>% 
  group_by(Character) %>%
  summarise(n = n())  %>% 
  arrange(desc(n)) %>%  
  slice(1:10)  %>% 
  mutate(image = case_when(
    Character ==  "Homer Simpson" ~ paste0(fig_dir, "Homer.png"),
    Character ==  "Marge Simpson" ~ paste0(fig_dir, "Marge_Simpson.png"),
    Character ==  "Bart Simpson" ~ paste0(fig_dir, "Bart_Simpson_200px.png"),
    Character ==  "Lisa Simpson" ~ paste0(fig_dir, "Lisa_Simpson.png"),
    Character ==  "C. Montgomery Burns" ~ paste0(fig_dir, "Mr_Burns.png"),
    Character ==  "Moe Szyslak" ~ paste0(fig_dir, "Moe_Szyslak.png"),
    Character ==  "Seymour Skinner" ~ paste0(fig_dir, "Seymour_Skinner.png"),
    Character ==  "Ned Flanders" ~ paste0(fig_dir, "Ned_Flanders.png"),
    Character ==  "Grampa Simpson" ~ paste0(fig_dir, "Abe_Simpson.png"),
    Character ==  "Chief Wiggum" ~ paste0(fig_dir, "Chief_Wiggum.png")))


barchart_lines <- lines %>% 
  ggplot(aes(x = reorder(Character, n), y = n, fill = Character)) +  
  geom_bar(stat = "identity") +  
  geom_image(aes(x = Character, y = -max(n) * 0.05, image = image), 
             size = 0.04, asp = 1) +  
  geom_text(aes(label = paste0("n = ", n), y=n/1.5),  # looks the best
            hjust= -0.5,
            position = position_dodge(width = .25),
            size = 10,  
            family = "Permanent Marker", 
            color = "#FFD90F") +
  coord_flip() + 
  labs(x = "", y = "Number of Lines", subtitle = "Most Spoken Lines per Character") +  
  custom_theme() +
  theme(
    axis.text.y = element_blank(),  
    axis.ticks.y = element_blank(), 
    legend.position = "none"
  )



# Combine all the charts and save them in an outputs folder ---------------





combined <- (barchart_gender + boxplot_viewers + barchart_lines) / heatmap_episode +
  plot_annotation(title = 'The Simpsons',
                  caption = "TidyTuesday: Week 5, 2025",
                  theme = custom_theme()) +
  theme(caption = element_text(hjust =  0.5))
Back to top