Source code for GPmix.misc

import skfda
from sklearn.mixture import GaussianMixture
from skfda.preprocessing.dim_reduction import FPCA
# from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score, silhouette_score, davies_bouldin_score
from sklearn.metrics import silhouette_score as ss
from sklearn.metrics import davies_bouldin_score as dbs
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

[docs] def silhouette_score(fd, y, **kwargs): """ Compute the silhouette score for clustering results on functional data. Parameters ---------- fd : skfda.FDataGrid Functional data object containing the data matrix. y : array-like Cluster labels for each sample. **kwargs : dict Additional keyword arguments passed to sklearn.metrics.silhouette_score. Returns ------- float Silhouette score. """ return ss(fd.data_matrix.squeeze(), y, **kwargs)
[docs] def davies_bouldin_score(fd, y): """ Compute the Davies-Bouldin score for clustering results on functional data. Parameters ---------- fd : skfda.FDataGrid Functional data object containing the data matrix. y : array-like Cluster labels for each sample. Returns ------- float Davies-Bouldin score. """ return dbs(fd.data_matrix.squeeze(), y)
[docs] def gmms_fit_plot_(weights, means, stdev, ax = None, **kwargs): """ Plot Gaussian mixture model (GMM) density curves. Parameters ---------- weights : array-like Weights of each Gaussian component. means : array-like Means of each Gaussian component. stdev : array-like Standard deviations of each Gaussian component. ax : matplotlib.axes.Axes, optional Axis to plot on. If None, uses the current axis. **kwargs : dict Additional keyword arguments passed to matplotlib plot. Returns ------- None """ d_pdf = lambda x, mu, sigma: np.exp(-0.5 * (((x - mu) / sigma)) ** 2) / (sigma * np.sqrt(2 * np.pi)) for i in range(len(weights)): x = np.linspace(means[i] - 3 * stdev[i], means[i] + 3 * stdev[i], 50) if ax: ax.plot(x, weights[i] * d_pdf(x, means[i], stdev[i]), linewidth = 3, **kwargs) else: plt.plot(x, weights[i] * d_pdf(x, means[i], stdev[i]), linewidth = 3, **kwargs)
[docs] def match_labels(cluster_labels, true_class_labels, cluster_class_labels_perm): """ Permute cluster labels to match specified true class labels. Parameters ---------- cluster_labels : array-like Cluster labels assigned by a clustering algorithm. true_class_labels : array-like True class labels to match. cluster_class_labels_perm : array-like Permutation of cluster labels to match true labels. Returns ------- np.ndarray Array of matched labels. """ label_match_dict = {} for key, value in zip(cluster_class_labels_perm, true_class_labels): label_match_dict[key] = value matched_labels = np.zeros_like(cluster_labels) for i in cluster_class_labels_perm: matched_labels[np.argwhere(cluster_labels == i)] = label_match_dict[i] return matched_labels
[docs] def estimate_nclusters(fdata, ncluster_grid = None): """ Employs a systematic search to identify the number of clusters that minimize the Akaike Information Criterion (AIC) or the Bayesian Information Criterion (BIC). Parameters ---------- fdata : skfda.FDataGrid The functional dataset for which the number of clusters is to be estimated. ncluster_grid : array-like, optional List or array specifying the grid within which the number of clusters is searched. Defaults to range(2, 15). Returns ------- int The estimated number of clusters in the functional dataset. """ if ncluster_grid is None: ncluster_grid = range(2,15) fpca_ = FPCA(n_components = 1) scores = fpca_.fit_transform(fdata) bic_ = [] aic_ = [] for n_comp in ncluster_grid: model = GaussianMixture(n_components=n_comp, n_init= 20) model.fit(scores) bic_.append(model.bic(scores)) aic_.append(model.aic(scores)) return min([ncluster_grid[np.argmin(aic_)], ncluster_grid[np.argmin(bic_)]])
[docs] def hybrid_representative_selection(data, p : float, p1 : float): """ Select representative samples from a large dataset using a hybrid approach combining random sampling and KMeans clustering. This method first randomly samples a proportion `p` of the data, then applies KMeans clustering to this subset to select a smaller set of representative samples. Parameters ---------- data : skfda.FDataGrid or np.ndarray Functional dataset or array to sample from. p : float Proportion of data to randomly sample (0 < p < 1). p1 : float Proportion of data to use as representative samples (0 < p1 < p). Returns ------- skfda.FDataGrid or np.ndarray Representative samples selected by KMeans clustering. Raises ------ AssertionError If p1 is not less than p. ValueError If data is not of type skfda.FDataGrid or np.ndarray. """ # Random Selection N = len(data) n = np.ceil(p * N).astype('int') # approx. (100 * p)% of dataset n_idx = np.random.choice(N, size = n, replace = False) # Kmeans selection assert p1 < p, "p1 must be less that p." k_reps = int(p1 * N) # approx (100 *p1)% of dataset Kmean_reps = KMeans(n_clusters=k_reps, init= 'k-means++',) if type(data) == skfda.FDataGrid: Kmean_reps.fit(data[n_idx].data_matrix.squeeze()) return skfda.FDataGrid(data_matrix= Kmean_reps.cluster_centers_, grid_points= data.grid_points[0]) elif type(data) == np.ndarray: Kmean_reps.fit(data[n_idx]) return Kmean_reps.cluster_centers_ else: raise ValueError("'data' type should be either skfda.FDataGrid or numpy.ndarray.")