import skfda
from sklearn.mixture import GaussianMixture
from skfda.preprocessing.dim_reduction import FPCA
# from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score, silhouette_score, davies_bouldin_score
from sklearn.metrics import silhouette_score as ss
from sklearn.metrics import davies_bouldin_score as dbs
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
[docs]
def silhouette_score(fd, y, **kwargs):
"""
Compute the silhouette score for clustering results on functional data.
Parameters
----------
fd : skfda.FDataGrid
Functional data object containing the data matrix.
y : array-like
Cluster labels for each sample.
**kwargs : dict
Additional keyword arguments passed to sklearn.metrics.silhouette_score.
Returns
-------
float
Silhouette score.
"""
return ss(fd.data_matrix.squeeze(), y, **kwargs)
[docs]
def davies_bouldin_score(fd, y):
"""
Compute the Davies-Bouldin score for clustering results on functional data.
Parameters
----------
fd : skfda.FDataGrid
Functional data object containing the data matrix.
y : array-like
Cluster labels for each sample.
Returns
-------
float
Davies-Bouldin score.
"""
return dbs(fd.data_matrix.squeeze(), y)
[docs]
def gmms_fit_plot_(weights, means, stdev, ax = None, **kwargs):
"""
Plot Gaussian mixture model (GMM) density curves.
Parameters
----------
weights : array-like
Weights of each Gaussian component.
means : array-like
Means of each Gaussian component.
stdev : array-like
Standard deviations of each Gaussian component.
ax : matplotlib.axes.Axes, optional
Axis to plot on. If None, uses the current axis.
**kwargs : dict
Additional keyword arguments passed to matplotlib plot.
Returns
-------
None
"""
d_pdf = lambda x, mu, sigma: np.exp(-0.5 * (((x - mu) / sigma)) ** 2) / (sigma * np.sqrt(2 * np.pi))
for i in range(len(weights)):
x = np.linspace(means[i] - 3 * stdev[i], means[i] + 3 * stdev[i], 50)
if ax:
ax.plot(x, weights[i] * d_pdf(x, means[i], stdev[i]), linewidth = 3, **kwargs)
else:
plt.plot(x, weights[i] * d_pdf(x, means[i], stdev[i]), linewidth = 3, **kwargs)
[docs]
def match_labels(cluster_labels, true_class_labels, cluster_class_labels_perm):
"""
Permute cluster labels to match specified true class labels.
Parameters
----------
cluster_labels : array-like
Cluster labels assigned by a clustering algorithm.
true_class_labels : array-like
True class labels to match.
cluster_class_labels_perm : array-like
Permutation of cluster labels to match true labels.
Returns
-------
np.ndarray
Array of matched labels.
"""
label_match_dict = {}
for key, value in zip(cluster_class_labels_perm, true_class_labels):
label_match_dict[key] = value
matched_labels = np.zeros_like(cluster_labels)
for i in cluster_class_labels_perm:
matched_labels[np.argwhere(cluster_labels == i)] = label_match_dict[i]
return matched_labels
[docs]
def estimate_nclusters(fdata, ncluster_grid = None):
"""
Employs a systematic search to identify the number of clusters that minimize the Akaike Information Criterion (AIC)
or the Bayesian Information Criterion (BIC).
Parameters
----------
fdata : skfda.FDataGrid
The functional dataset for which the number of clusters is to be estimated.
ncluster_grid : array-like, optional
List or array specifying the grid within which the number of clusters is searched.
Defaults to range(2, 15).
Returns
-------
int
The estimated number of clusters in the functional dataset.
"""
if ncluster_grid is None:
ncluster_grid = range(2,15)
fpca_ = FPCA(n_components = 1)
scores = fpca_.fit_transform(fdata)
bic_ = []
aic_ = []
for n_comp in ncluster_grid:
model = GaussianMixture(n_components=n_comp, n_init= 20)
model.fit(scores)
bic_.append(model.bic(scores))
aic_.append(model.aic(scores))
return min([ncluster_grid[np.argmin(aic_)], ncluster_grid[np.argmin(bic_)]])
[docs]
def hybrid_representative_selection(data, p : float, p1 : float):
"""
Select representative samples from a large dataset using a hybrid approach
combining random sampling and KMeans clustering.
This method first randomly samples a proportion `p` of the data, then applies
KMeans clustering to this subset to select a smaller set of representative samples.
Parameters
----------
data : skfda.FDataGrid or np.ndarray
Functional dataset or array to sample from.
p : float
Proportion of data to randomly sample (0 < p < 1).
p1 : float
Proportion of data to use as representative samples (0 < p1 < p).
Returns
-------
skfda.FDataGrid or np.ndarray
Representative samples selected by KMeans clustering.
Raises
------
AssertionError
If p1 is not less than p.
ValueError
If data is not of type skfda.FDataGrid or np.ndarray.
"""
# Random Selection
N = len(data)
n = np.ceil(p * N).astype('int') # approx. (100 * p)% of dataset
n_idx = np.random.choice(N, size = n, replace = False)
# Kmeans selection
assert p1 < p, "p1 must be less that p."
k_reps = int(p1 * N) # approx (100 *p1)% of dataset
Kmean_reps = KMeans(n_clusters=k_reps, init= 'k-means++',)
if type(data) == skfda.FDataGrid:
Kmean_reps.fit(data[n_idx].data_matrix.squeeze())
return skfda.FDataGrid(data_matrix= Kmean_reps.cluster_centers_,
grid_points= data.grid_points[0])
elif type(data) == np.ndarray:
Kmean_reps.fit(data[n_idx])
return Kmean_reps.cluster_centers_
else:
raise ValueError("'data' type should be either skfda.FDataGrid or numpy.ndarray.")