An Analysis of the Simpsons data provided for Week 5 of Tidy Tuesday
TidyTuesday
Data Visualization
R Programming
2025
Author
Peter Gray
Published
February 2, 2025
Thumbnail
1. R code
Show code
# Load the packages in ----------------------------------------------------if(!require(httr)){install.packages("httr"); library(httr)}if(!require(jsonlite)){install.packages("jsonlite"); library(jsonlite)}if(!require(withr)){install.packages("withr"); library(withr)}if(!require(tidyverse)){install.packages("tidyverse"); library(tidyverse)}if(!require(readxl)){install.packages("readxl"); library(readxl)}if(!require(patchwork)){install.packages("patchwork"); library(patchwork)}if(!require(gridExtra)){install.packages("gridExtra"); library(gridExtra)}if(!require(grid)){install.packages("grid"); library(grid)}if(!require(cowplot)){install.packages("cowplot"); library(cowplot)}if(!require(showtext)){install.packages("showtext"); library(showtext)}if(!require(ggpmisc)){install.packages("ggpmisc"); library(ggpmisc)}if(!require(ggimage)){install.packages("ggimage"); library(ggimage)}if(!require(jpeg)){install.packages("jpeg"); library(jpeg)}if(!require(tinytex)){install.packages("tinytex"); library(tinytex)}# get the wdwd<-getwd()# I have gone beyond the remit of the TidyTuesday and used the whole dataset --------# No one cares about the later seasons# Source for using full dataset: https://github.com/toddwschneider/flim-springfield# Define the metadata URL and fetch itmetadata_url<-"www.kaggle.com/datasets/prashant111/the-simpsons-dataset/croissant/download"response<-httr::GET(metadata_url)# Ensure the request succeededif(httr::http_status(response)$category!="Success"){stop("Failed to fetch metadata.")}# Parse the metadatametadata<-httr::content(response, as ="parsed", type ="application/json")# Locate the ZIP file URLdistribution<-metadata$distributionzip_url<-NULLfor(fileindistribution){if(file$encodingFormat=="application/zip"){zip_url<-file$contentUrlbreak}}if(is.null(zip_url)){stop("No ZIP file URL found in the metadata.")}# Download the ZIP file. We'll use the withr package to make sure the downloaded# files get cleaned up when we're done.temp_file<-withr::local_tempfile(fileext =".zip")utils::download.file(zip_url, temp_file, mode ="wb")# Unzip and read the CSVunzip_dir<-withr::local_tempdir()utils::unzip(temp_file, exdir =unzip_dir)# Locate the CSV file within the extracted contentscsv_file<-list.files(unzip_dir, pattern ="\\.csv$", full.names =TRUE)if(length(csv_file)==0){stop("No CSV file found in the unzipped contents.")}# Read the CSV into a dataframecharacters<-read_csv(csv_file[1])episodes<-read_csv(csv_file[2])locations<-read_csv(csv_file[3])script_lines<-read_csv(csv_file[4])# Load the font and define the theme --------------------------------------#| warning: false#| echo: FALSE#| message: falsefont_add_google("Permanent Marker")showtext_auto()# Custom Theme - to emulate the simpson colour scheme of yellow and bluecustom_theme<-function(){ggplot2::theme( plot.title.position ="plot", plot.caption.position ="plot", plot.title =element_text(size =60, face ="bold", family ="Permanent Marker", color ="#FFD90F", hjust =0.5), plot.subtitle =element_text(size =48, face ="bold", family ="Permanent Marker", color ="#FFD90F"), plot.caption =element_text(size =13, family ="Permanent Marker", color ="#FFD90F"), axis.text =element_text(family ="Permanent Marker", size =13, color ="#FFD90F"), axis.title.x =element_text(size =32, family ="Permanent Marker", color ="#FFD90F"), axis.title.y =element_text(size =32, family ="Permanent Marker", color ="#FFD90F"), axis.line =element_line(linewidth =0.5, colour ="darkgrey"), axis.text.x =element_text(angle =45, hjust =1, size =20, family ="Permanent Marker", color ="#FFD90F"), axis.text.y =element_text(angle =45, hjust =1, size =20, family ="Permanent Marker", color ="#FFD90F"), panel.grid.major =element_blank(), panel.grid.minor =element_blank(), panel.background =element_rect(fill =alpha("#009DDC", 0.3), color ="#009DDC", linewidth =0.5, linetype ="solid"), plot.background =element_rect(fill =alpha("#009DDC", 1), color ="#009DDC", linewidth =0.5, linetype ="solid"), legend.background =element_rect(fill =alpha("#009DDC", 0.5), color ="#009DDC", linewidth =0.5, linetype ="solid"), legend.title =element_text(family ="Permanent Marker", size =32, color ="#FFD90F"), legend.text =element_text(family ="Permanent Marker", size =32, color ="#FFD90F"), legend.position ="bottom")}# Character Gender Split in the Simpsons --------------------------------------------gender<-characters%>%select(gender)%>%filter(!is.na(gender))%>%mutate(gender =str_to_upper(factor(gender)), gender =case_when(gender=="F"~"Female", gender=="M"~"Male",TRUE~NA))barchart_gender<-gender%>%ggplot(aes(x =gender))+geom_bar(aes(fill =gender))+geom_text(stat ="count", aes(label =after_stat(paste0("n = ", count))), vjust =2, size =15, family ="Permanent Marker", color ="#FFD90F")+scale_fill_brewer(palette ="Set2")+# Use a nice color schemecustom_theme()+labs(subtitle ="Character Gender Split", y ="Number", fill ="Gender")# Heatmap of IMDB Ratings -------------------------------------------------heat<-episodes%>%select(season, number_in_season, imdb_rating, title)%>%mutate(season =factor(season), number_in_season =factor(number_in_season))# Build a tablemax_rating<-max(heat$imdb_rating, na.rm =TRUE)min_rating<-min(heat$imdb_rating, na.rm =TRUE)low_rating<-heat%>%filter(imdb_rating==min_rating)high_rating<-heat%>%filter(imdb_rating==max_rating)highlight_episodes<-heat%>%filter(imdb_rating==max_rating|imdb_rating==min_rating)%>%arrange(desc(imdb_rating))highlight_table<-data.frame( `Episode Name` =highlight_episodes$title, `Season` =highlight_episodes$season, `IMDB Rating` =round(highlight_episodes$imdb_rating, 1), check.names=FALSE)#Only way I could work out how to do this to add a title to the table abovetitle_table<-data.frame(matrix(ncol =1, nrow =1))colnames(title_table)="Highest and Lowest Rated Episodes"title_table[is.na(title_table)]<-""heatmap_episode<-ggplot(heat, aes(x =season, y =number_in_season, fill =imdb_rating))+geom_tile()+geom_text(aes(label =round(imdb_rating, 1)), color =alpha("Black", 0.5), size =8, family ="Permanent Marker")+scale_fill_gradient(low ="firebrick2", high ="yellow", limits =c(4,10))+labs( subtitle ="Heatmap of Episode IMDB Ratings", x ="Season Number", y ="Episode Number", fill ="IMDB Rating", caption ="Grey indicated data not available")+custom_theme()+theme(legend.position ="right")+geom_tile(data =low_rating, aes(x =season, y =number_in_season), color ="darkred", linewidth =1, fill =NA)+geom_tile(data =high_rating, aes(x =season, y =number_in_season), color ="darkgreen", linewidth =1, fill =NA)+# Add a Table to the graphannotate(geom ="table", x =20, y =30, label =list(highlight_table), vjust =1, hjust =0, family ="Permanent Marker", color =alpha("black", 0.5), table.theme =ttheme_minimal(title ="Highest and Lowest Rated Episodes", base_colour ="#FFD90F", base_family ="Permanent Marker", color ="#009DDC",base_size =20, core=list(bg_params =list(fill ="#009DDC")), colhead =list(bg_params =list(fill ="#009DDC"))))+# Add a table title to the graph (Messy but)annotate(geom ="table", x =20.5, y =33, label =list(title_table), vjust =1, hjust =0, family ="Permanent Marker", color =alpha("black", 0.5), table.theme =ttheme_minimal(title ="Highest and Lowest Rated Episodes", base_colour ="#FFD90F", base_family ="Permanent Marker", base_size =20, color ="#009DDC", core=list(bg_params =list(fill ="#009DDC")), colhead =list(bg_params =list(fill ="#009DDC"))))# Boxplot of viewership over time -----------------------------------------viewers<-episodes%>%select(season, us_viewers_in_millions)%>%group_by(season)%>%summarise(mean =mean(us_viewers_in_millions), sd =sd(us_viewers_in_millions), n =n(), se =sd/sqrt(n), lci =mean-qt(1-(0.05/2), n-1)*se, uci =mean+qt(1-(0.05/2), n-1)*se)boxplot_viewers<-ggplot(episodes, aes(x =factor(season), y =us_viewers_in_millions))+geom_boxplot(fill =alpha("firebrick2", 0.9), color ="black", outlier.shape =NA)+geom_smooth(method ="lm", se=FALSE, color="#F2E86D", aes(group=1))+labs( subtitle ="US Viewership per Season", x ="Season Number", y ="US Viewers (Millions)", caption ="Yellow Line: Trendline derived from Linear Model")+custom_theme()+theme(plot.caption =element_text(hjust =0))wd<-getwd()fig_dir<-paste0(wd, "/1. Data/1. Images/")lines<-script_lines%>%filter(speaking_line==T)%>%rename(Character =raw_character_text)%>%select(Character, word_count)%>%group_by(Character)%>%summarise(n =n())%>%arrange(desc(n))%>%slice(1:10)%>%mutate(image =case_when(Character=="Homer Simpson"~paste0(fig_dir, "Homer.png"),Character=="Marge Simpson"~paste0(fig_dir, "Marge_Simpson.png"),Character=="Bart Simpson"~paste0(fig_dir, "Bart_Simpson_200px.png"),Character=="Lisa Simpson"~paste0(fig_dir, "Lisa_Simpson.png"),Character=="C. Montgomery Burns"~paste0(fig_dir, "Mr_Burns.png"),Character=="Moe Szyslak"~paste0(fig_dir, "Moe_Szyslak.png"),Character=="Seymour Skinner"~paste0(fig_dir, "Seymour_Skinner.png"),Character=="Ned Flanders"~paste0(fig_dir, "Ned_Flanders.png"),Character=="Grampa Simpson"~paste0(fig_dir, "Abe_Simpson.png"),Character=="Chief Wiggum"~paste0(fig_dir, "Chief_Wiggum.png")))barchart_lines<-lines%>%ggplot(aes(x =reorder(Character, n), y =n, fill =Character))+geom_bar(stat ="identity")+geom_image(aes(x =Character, y =-max(n)*0.05, image =image), size =0.04, asp =1)+geom_text(aes(label =paste0("n = ", n), y=n/1.5), # looks the best hjust=-0.5, position =position_dodge(width =.25), size =10, family ="Permanent Marker", color ="#FFD90F")+coord_flip()+labs(x ="", y ="Number of Lines", subtitle ="Most Spoken Lines per Character")+custom_theme()+theme( axis.text.y =element_blank(), axis.ticks.y =element_blank(), legend.position ="none")# Combine all the charts and save them in an outputs folder ---------------combined<-(barchart_gender+boxplot_viewers+barchart_lines)/heatmap_episode+plot_annotation(title ='The Simpsons', caption ="TidyTuesday: Week 5, 2025", theme =custom_theme())+theme(caption =element_text(hjust =0.5))