import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min
import requests
# Load the penguins CSV from the TidyTuesday repository (requires network access).
penguins = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-04-15/penguins.csv')

# Replace "NaN" text with actual NaN values so dropna() below catches them.
penguins_clean = penguins.replace("NaN", np.nan)

# Keep only the columns used for clustering and drop incomplete rows.
# NOTE: dropna() keeps the original row labels, so the index of
# penguins_clean has gaps wherever a row was removed.
penguins_clean = penguins_clean[["sex", "island", "bill_len", "bill_dep", "flipper_len", "body_mass"]].dropna()

# Encode the two categorical columns as float codes so every feature is numeric.
# The same encoder object is re-fitted per column; after this block `le`
# holds the classes of 'island' only.
le = LabelEncoder()
penguins_clean['sex'] = le.fit_transform(penguins_clean['sex']).astype(float)
penguins_clean['island'] = le.fit_transform(penguins_clean['island']).astype(float)

# Number of clusters is fixed at 3 (no search over k is performed here).
k = 3
kmeans = KMeans(n_clusters=k, random_state=42)
# NOTE(review): features are passed to KMeans unscaled, so columns with a
# larger numeric range weigh more in the Euclidean distance - consider
# standardising if that is not intended.
# The right-hand side is evaluated before 'cluster' is added, so the model
# is fitted on the six feature columns only.
penguins_clean['cluster'] = kmeans.fit_predict(penguins_clean)

# Project the feature columns (without the cluster label) onto two principal
# components for plotting.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(penguins_clean.drop('cluster', axis=1))
pca_df = pd.DataFrame(pca_components, columns=['PC1', 'PC2'])

# pca_df has a fresh 0..n-1 index while penguins_clean still carries the
# gapped post-dropna labels. Assigning the Series directly would align on
# index labels and attach cluster ids to the wrong rows (or NaN), so assign
# by position instead: row i of pca_components corresponds to row i of
# penguins_clean.
pca_df['cluster'] = penguins_clean['cluster'].astype(str).to_numpy()
# Plot: scatter of the PCA projection coloured by cluster (the calls below are commented out)
# plt.figure(figsize=(32, 24))
# # sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='cluster', palette='Set2', s=100)
#
# plt.title('K-Means Clustering of Penguins (PCA Projection)')
# plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)