import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min
import requests
# Load the TidyTuesday penguins data set straight from GitHub.
penguins = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-04-15/penguins.csv'
)

# Turn any literal "NaN" strings into real missing values, keep only the
# columns used for clustering, and discard rows with a missing value in any
# of them.
penguins_clean = (
    penguins.replace("NaN", np.nan)[
        ["sex", "island", "bill_len", "bill_dep", "flipper_len", "body_mass"]
    ]
    .dropna()
)

# Replace the two categorical columns with float label codes so that every
# column is numeric. The same encoder object is refit for each column, so
# after the loop `le` only remembers the classes of the last one ("island").
le = LabelEncoder()
for _col in ("sex", "island"):
    penguins_clean[_col] = le.fit_transform(penguins_clean[_col]).astype(float)
# The number of clusters is fixed at 3 here rather than being searched for.
k = 3
kmeans = KMeans(n_clusters=k, random_state=42)

# Fit on every column of the cleaned frame and store each row's cluster label
# alongside the features.
# NOTE(review): the features are not scaled before clustering, so columns with
# larger numeric ranges will dominate the distances - confirm this is intended.
penguins_clean['cluster'] = kmeans.fit(penguins_clean).labels_
# Project the clustering features (everything except the cluster label) onto
# their first two principal components for a 2-D view.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(penguins_clean.drop('cluster', axis=1))
pca_df = pd.DataFrame(pca_components, columns=['PC1', 'PC2'])

# Attach the cluster labels by position, not by index label. `pca_df` has a
# fresh 0..n-1 index, whereas `penguins_clean` keeps its original row labels
# after `.dropna()`; assigning the Series directly would align on those labels,
# pairing points with the wrong cluster and leaving NaN wherever a label is
# absent. `.to_numpy()` drops the index so row i gets row i's cluster.
# The labels are stored as strings so they are treated as categories.
pca_df['cluster'] = penguins_clean['cluster'].astype(str).to_numpy()
# Plot (currently commented out): scatter of PC1 vs PC2 coloured by cluster
# plt.figure(figsize=(32, 24))
# # sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='cluster', palette='Set2', s=100)
#
# plt.title('K-Means Clustering of Penguins (PCA Projection)')
# plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)