|
20 | 20 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
21 | 21 | import bench |
22 | 22 | import numpy as np |
23 | | -from sklearn.metrics.cluster import davies_bouldin_score |
24 | 23 |
|
25 | | -parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') |
26 | | -parser.add_argument('-i', '--filei', '--fileI', '--init', |
27 | | - type=str, help='Initial clusters') |
28 | | -parser.add_argument('-t', '--tol', type=float, default=0., |
29 | | - help='Absolute threshold') |
30 | | -parser.add_argument('--maxiter', type=int, default=100, |
31 | | - help='Maximum number of iterations') |
32 | | -parser.add_argument('--n-clusters', type=int, help='Number of clusters') |
33 | | -params = bench.parse_args(parser) |
34 | 24 |
|
35 | | -from sklearn.cluster import KMeans |
| 25 | +def main(): |
| 26 | + from sklearn.cluster import KMeans |
| 27 | + from sklearn.metrics.cluster import davies_bouldin_score |
36 | 28 |
|
37 | | -# Load and convert generated data |
38 | | -X_train, X_test, _, _ = bench.load_data(params) |
| 29 | + # Load and convert generated data |
| 30 | + X_train, X_test, _, _ = bench.load_data(params) |
39 | 31 |
|
40 | | -if params.filei == 'k-means++': |
41 | | - X_init = 'k-means++' |
42 | | -# Load initial centroids from specified path |
43 | | -elif params.filei is not None: |
44 | | - X_init = np.load(params.filei).astype(params.dtype) |
45 | | - params.n_clusters = X_init.shape[0] |
46 | | -# or choose random centroids from training data |
47 | | -else: |
48 | | - np.random.seed(params.seed) |
49 | | - centroids_idx = np.random.randint(0, X_train.shape[0], |
50 | | - size=params.n_clusters) |
51 | | - if hasattr(X_train, "iloc"): |
52 | | - X_init = X_train.iloc[centroids_idx].values |
| 32 | + if params.filei == 'k-means++': |
| 33 | + X_init = 'k-means++' |
| 34 | + # Load initial centroids from specified path |
| 35 | + elif params.filei is not None: |
| 36 | + X_init = np.load(params.filei).astype(params.dtype) |
| 37 | + params.n_clusters = X_init.shape[0] |
| 38 | + # or choose random centroids from training data |
53 | 39 | else: |
54 | | - X_init = X_train[centroids_idx] |
| 40 | + np.random.seed(params.seed) |
| 41 | + centroids_idx = np.random.randint(0, X_train.shape[0], |
| 42 | + size=params.n_clusters) |
| 43 | + if hasattr(X_train, "iloc"): |
| 44 | + X_init = X_train.iloc[centroids_idx].values |
| 45 | + else: |
| 46 | + X_init = X_train[centroids_idx] |
55 | 47 |
|
| 48 | + def fit_kmeans(X, X_init): |
| 49 | + alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, |
| 50 | + max_iter=params.maxiter, init=X_init, n_init=1) |
| 51 | + alg.fit(X) |
| 52 | + return alg |
56 | 53 |
|
57 | | -def fit_kmeans(X): |
58 | | - global X_init, params |
59 | | - alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, |
60 | | - max_iter=params.maxiter, init=X_init, n_init=1) |
61 | | - alg.fit(X) |
62 | | - return alg |
| 54 | + # Time fit |
| 55 | + fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, |
| 56 | + X_init, params=params) |
63 | 57 |
|
| 58 | + train_predict = kmeans.predict(X_train) |
| 59 | + acc_train = davies_bouldin_score(X_train, train_predict) |
64 | 60 |
|
65 | | -# Time fit |
66 | | -fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train, params=params) |
| 61 | + # Time predict |
| 62 | + predict_time, test_predict = bench.measure_function_time( |
| 63 | + kmeans.predict, X_test, params=params) |
67 | 64 |
|
68 | | -train_predict = kmeans.predict(X_train) |
69 | | -acc_train = davies_bouldin_score(X_train, train_predict) |
| 65 | + acc_test = davies_bouldin_score(X_test, test_predict) |
70 | 66 |
|
71 | | -# Time predict |
72 | | -predict_time, test_predict = bench.measure_function_time( |
73 | | - kmeans.predict, X_test, params=params) |
| 67 | + bench.print_output(library='sklearn', algorithm='kmeans', |
| 68 | + stages=['training', 'prediction'], |
| 69 | + params=params, functions=['KMeans.fit', 'KMeans.predict'], |
| 70 | + times=[fit_time, predict_time], |
| 71 | + accuracy_type='davies_bouldin_score', |
| 72 | + accuracies=[acc_train, acc_test], data=[X_train, X_test], |
| 73 | + alg_instance=kmeans) |
74 | 74 |
|
75 | | -acc_test = davies_bouldin_score(X_test, test_predict) |
76 | 75 |
|
77 | | -bench.print_output(library='sklearn', algorithm='kmeans', |
78 | | - stages=['training', 'prediction'], |
79 | | - params=params, functions=['KMeans.fit', 'KMeans.predict'], |
80 | | - times=[fit_time, predict_time], accuracy_type='davies_bouldin_score', |
81 | | - accuracies=[acc_train, acc_test], data=[X_train, X_test], |
82 | | - alg_instance=kmeans) |
| 76 | +if __name__ == "__main__": |
| 77 | + parser = argparse.ArgumentParser(description='scikit-learn K-means benchmark') |
| 78 | + parser.add_argument('-i', '--filei', '--fileI', '--init', |
| 79 | + type=str, help='Initial clusters') |
| 80 | + parser.add_argument('-t', '--tol', type=float, default=0., |
| 81 | + help='Absolute threshold') |
| 82 | + parser.add_argument('--maxiter', type=int, default=100, |
| 83 | + help='Maximum number of iterations') |
| 84 | + parser.add_argument('--n-clusters', type=int, help='Number of clusters') |
| 85 | + params = bench.parse_args(parser) |
| 86 | + bench.run_with_context(params, main) |
0 commit comments