TidyTuesday Week 15: Penguins

This week we’re taking another look at penguins! The Palmer Penguins dataset first appeared in TidyTuesday back in July of 2020. We’re using the dataset again because, as of R 4.5.0 (released this past Friday), the datasets are available in the base R datasets package!

TidyTuesday

Data Visualization

R Programming

Python

2025

Author

Peter Gray

Published

April 16, 2025

1. R code

Show code

# Load the packages in ----------------------------------------------------

# CRAN packages used by this script. Each one is installed on first use and
# then attached. Driving this from a single vector keeps the installed name
# and the attached name in sync (the previous hand-written line for ggtext
# attached ggimage instead, so ggtext was never loaded after a fresh install).
cran_packages <- c(
  "tidyverse",
  "patchwork",
  "ggplot2",
  "ggwordcloud",
  "ggpmisc",
  "ggimage",
  "ggtext",
  "rlist",
  "factoextra",
  "NbClust"
)

for (pkg in cran_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# I stick all my styling into a custom package to tidy up my code and keep it
# consistent over time.
# NOTE(review): devtools::install() is given a bare name here, so it presumably
# expects the package source in a "CustomGGPlot2Theme" directory under the
# working directory - confirm before running on a new machine.
if (!requireNamespace("CustomGGPlot2Theme", quietly = TRUE)) {
  devtools::install("CustomGGPlot2Theme")
}
library(CustomGGPlot2Theme)

# Remember the working directory at start-up (force of habit)
wd <- getwd()

# Pulling images off the internet was slow enough to cause a time-out error,
# so raise the download timeout
options(timeout = 1000)

font <- "noto_mono"

# Read this week's TidyTuesday penguins data straight from GitHub
penguins <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-04-15/penguins.csv"
)

# Keep only the columns used for clustering, drop rows with any missing value,
# then recode the two categorical columns as numeric level codes so every
# column handed to k-means is numeric
penguins_clean <- penguins %>%
  select(sex, island, bill_len, bill_dep, flipper_len, body_mass) %>%
  na.omit() %>%
  mutate(across(c(sex, island), ~ as.numeric(factor(.x))))

# Find the optimal number of clusters using elbow method
# nb <- NbClust(penguins_clean, distance = "euclidean", min.nc = 2, max.nc = 10, method = "kmeans")

# --- First pass: 2 clusters ----------------------------------------------
k <- 2

# Fit k-means on the cleaned, all-numeric data; graph = FALSE because the
# plots are built explicitly below
clust_kmeans <- eclust(
  penguins_clean,
  k = k,
  FUNcluster = "kmeans",
  hc_metric = "euclidean",
  graph = FALSE
)
kmeans_silhouette <- fviz_silhouette(clust_kmeans, print.summary = FALSE)

kmeans_graph_2_clusters <- fviz_cluster(clust_kmeans, data = penguins_clean) +
  Custom_Style()

# --- Second pass: 3 clusters ---------------------------------------------
# Now let's try with 3 clusters. The model has to be refitted with the new k:
# without this call the silhouette and the "3 cluster" plot below would be
# drawn from the 2-cluster fit above.
k <- 3
clust_kmeans <- eclust(
  penguins_clean,
  k = k,
  FUNcluster = "kmeans",
  hc_metric = "euclidean",
  graph = FALSE
)
kmeans_silhouette <- fviz_silhouette(clust_kmeans, print.summary = FALSE)

# labelsize = 0 hides the per-point labels so the clusters stay readable
kmeans_graph_3_clusters <- fviz_cluster(clust_kmeans, data = penguins_clean, labelsize = 0) +
  Custom_Style() +
  labs(title = "K-means Clustering of Penguins (PCA Projection) using R and ggplot2")

2. Python code

Show code


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min

import requests


# Read this week's TidyTuesday penguins data straight from GitHub
penguins = pd.read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-04-15/penguins.csv"
)

# Turn any literal "NaN" text into real missing values, keep only the columns
# used for clustering, and drop rows that are missing any of them
penguins_clean = (
    penguins
    .replace("NaN", np.nan)
    [["sex", "island", "bill_len", "bill_dep", "flipper_len", "body_mass"]]
    .dropna()
)

# Integer-code the two categorical columns (stored as floats) so every
# feature is numeric; the encoder is re-fitted for each column in turn
le = LabelEncoder()
for column in ("sex", "island"):
    penguins_clean[column] = le.fit_transform(penguins_clean[column]).astype(float)


# Just gone straight for 3 now
k = 3
kmeans = KMeans(n_clusters = k, random_state = 42)

# Cluster on the feature columns only. Dropping any existing 'cluster' column
# means re-running this cell does not feed the previous labels back in as a
# feature (errors='ignore' makes this a no-op on the first run).
features = penguins_clean.drop(columns='cluster', errors='ignore')
penguins_clean['cluster'] = kmeans.fit_predict(features)


# Project the same features onto their first two principal components so the
# clusters can be drawn in 2-D
pca = PCA(n_components=2)
pca_components = pca.fit_transform(features)
pca_df = pd.DataFrame(pca_components, columns=['PC1', 'PC2'])
# pca_df gets a fresh 0..n-1 index, whereas penguins_clean still carries its
# original row labels (with gaps left by dropna). Assigning the Series directly
# would align on those labels and attach clusters to the wrong rows / leave
# NaN, so copy the labels across by position instead.
pca_df['cluster'] = penguins_clean['cluster'].astype(str).to_numpy()

# Plot
# plt.figure(figsize=(32, 24))
# # sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='cluster', palette='Set2', s=100)
# 
# plt.title('K-Means Clustering of Penguins (PCA Projection)')
# plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)