X Tutup
import numpy import matplotlib.pyplot import pygad cluster1_num_samples = 10 cluster1_x1_start = 0 cluster1_x1_end = 5 cluster1_x2_start = 2 cluster1_x2_end = 6 cluster1_x1 = numpy.random.random(size=(cluster1_num_samples)) cluster1_x1 = cluster1_x1 * (cluster1_x1_end - cluster1_x1_start) + cluster1_x1_start cluster1_x2 = numpy.random.random(size=(cluster1_num_samples)) cluster1_x2 = cluster1_x2 * (cluster1_x2_end - cluster1_x2_start) + cluster1_x2_start cluster2_num_samples = 10 cluster2_x1_start = 10 cluster2_x1_end = 15 cluster2_x2_start = 8 cluster2_x2_end = 12 cluster2_x1 = numpy.random.random(size=(cluster2_num_samples)) cluster2_x1 = cluster2_x1 * (cluster2_x1_end - cluster2_x1_start) + cluster2_x1_start cluster2_x2 = numpy.random.random(size=(cluster2_num_samples)) cluster2_x2 = cluster2_x2 * (cluster2_x2_end - cluster2_x2_start) + cluster2_x2_start c1 = numpy.array([cluster1_x1, cluster1_x2]).T c2 = numpy.array([cluster2_x1, cluster2_x2]).T data = numpy.concatenate((c1, c2), axis=0) matplotlib.pyplot.scatter(cluster1_x1, cluster1_x2) matplotlib.pyplot.scatter(cluster2_x1, cluster2_x2) matplotlib.pyplot.title("Optimal Clustering") matplotlib.pyplot.show() def euclidean_distance(X, Y): """ Calculate the euclidean distance between X and Y. It accepts: :X should be a matrix of size (N, f) where N is the number of samples and f is the number of features for each sample. :Y should be of size f. In other words, it is a single sample. Returns a vector of N elements with the distances between the N samples and the Y. """ return numpy.sqrt(numpy.sum(numpy.power(X - Y, 2), axis=1)) def cluster_data(solution, solution_idx): """ Clusters the data based on the current solution. """ global num_cluster, data feature_vector_length = data.shape[1] cluster_centers = [] # A list of size (C, f) where C is the number of clusters and f is the number of features representing each sample. all_clusters_dists = [] # A list of size (C, N) where C is the number of clusters and N is the number of data samples. It holds the distances between each cluster center and all the data samples. clusters = [] # A list with C elements where each element holds the indices of the samples within a cluster. clusters_sum_dist = [] # A list with C elements where each element represents the sum of distances of the samples with a cluster. for clust_idx in range(num_clusters): # Return the current cluster center. cluster_centers.append(solution[feature_vector_length*clust_idx:feature_vector_length*(clust_idx+1)]) # Calculate the distance (e.g. euclidean) between the current cluster center and all samples. cluster_center_dists = euclidean_distance(data, cluster_centers[clust_idx]) all_clusters_dists.append(numpy.array(cluster_center_dists)) cluster_centers = numpy.array(cluster_centers) all_clusters_dists = numpy.array(all_clusters_dists) # A 1D array that, for each sample, holds the index of the cluster with the smallest distance. # In other words, the array holds the sample's cluster index. cluster_indices = numpy.argmin(all_clusters_dists, axis=0) for clust_idx in range(num_clusters): clusters.append(numpy.where(cluster_indices == clust_idx)[0]) # Calculate the sum of distances for the cluster. if len(clusters[clust_idx]) == 0: # In case the cluster is empty (i.e. has zero samples). clusters_sum_dist.append(0) else: # When the cluster is not empty (i.e. has at least 1 sample). clusters_sum_dist.append(numpy.sum(all_clusters_dists[clust_idx, clusters[clust_idx]])) # clusters_sum_dist.append(numpy.sum(euclidean_distance(data[clusters[clust_idx], :], cluster_centers[clust_idx]))) clusters_sum_dist = numpy.array(clusters_sum_dist) return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist def fitness_func(solution, solution_idx): _, _, _, _, clusters_sum_dist = cluster_data(solution, solution_idx) # The tiny value 0.00000001 is added to the denominator in case the average distance is 0. fitness = 1.0 / (numpy.sum(clusters_sum_dist) + 0.00000001) return fitness num_clusters = 2 num_genes = num_clusters * data.shape[1] ga_instance = pygad.GA(num_generations=100, sol_per_pop=10, num_parents_mating=5, init_range_low=-6, init_range_high=20, keep_parents=2, num_genes=num_genes, fitness_func=fitness_func, suppress_warnings=True) ga_instance.run() best_solution, best_solution_fitness, best_solution_idx = ga_instance.best_solution() print("Best solution is {bs}".format(bs=best_solution)) print("Fitness of the best solution is {bsf}".format(bsf=best_solution_fitness)) print("Best solution found after {gen} generations".format(gen=ga_instance.best_solution_generation)) cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist = cluster_data(best_solution, best_solution_idx) for cluster_idx in range(num_clusters): cluster_x = data[clusters[cluster_idx], 0] cluster_y = data[clusters[cluster_idx], 1] matplotlib.pyplot.scatter(cluster_x, cluster_y) matplotlib.pyplot.scatter(cluster_centers[cluster_idx, 0], cluster_centers[cluster_idx, 1], linewidths=5) matplotlib.pyplot.title("Clustering using PyGAD") matplotlib.pyplot.show()
X Tutup