[SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator

## What changes were proposed in this pull request? Using `computeCost` for evaluating a model is a very poor approach. We should advise users to take the better approach that is available, i.e. using the `ClusteringEvaluator` to evaluate their models. The PR updates the examples for `BisectingKMeans` in order to do that. ## How was this patch tested? By running the examples. Closes #22786 from mgaido91/SPARK-25764. Authored-by: Marco Gaido <marcogaido91@gmail.com> Signed-off-by: DB Tsai <d_tsai@apple.com>
2018-11-05 22:42:04 +00:00 · 2018-11-05 22:42:04 +00:00 · 0b59170001
parent 486acda8c5
commit 0b59170001
3 changed files with 27 additions and 9 deletions
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@ -20,6 +20,7 @@ package org.apache.spark.examples.ml;
 // $example on$
 import org.apache.spark.ml.clustering.BisectingKMeans;
 import org.apache.spark.ml.clustering.BisectingKMeansModel;
+import org.apache.spark.ml.evaluation.ClusteringEvaluator;
 import org.apache.spark.ml.linalg.Vector;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@ -50,9 +51,14 @@ public class JavaBisectingKMeansExample {
    BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1);
    BisectingKMeansModel model = bkm.fit(dataset);

-    // Evaluate clustering.
-    double cost = model.computeCost(dataset);
-    System.out.println("Within Set Sum of Squared Errors = " + cost);
+    // Make predictions
+    Dataset<Row> predictions = model.transform(dataset);
+
+    // Evaluate clustering by computing Silhouette score
+    ClusteringEvaluator evaluator = new ClusteringEvaluator();
+
+    double silhouette = evaluator.evaluate(predictions);
+    System.out.println("Silhouette with squared euclidean distance = " + silhouette);

    // Shows the result.
    System.out.println("Cluster Centers: ");
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@ -24,6 +24,7 @@ from __future__ import print_function

 # $example on$
 from pyspark.ml.clustering import BisectingKMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
 # $example off$
 from pyspark.sql import SparkSession

@ -41,9 +42,14 @@ if __name__ == "__main__":
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)

-    # Evaluate clustering.
-    cost = model.computeCost(dataset)
-    print("Within Set Sum of Squared Errors = " + str(cost))
+    # Make predictions
+    predictions = model.transform(dataset)
+
+    # Evaluate clustering by computing Silhouette score
+    evaluator = ClusteringEvaluator()
+
+    silhouette = evaluator.evaluate(predictions)
+    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    print("Cluster Centers: ")
--- a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
@ -21,6 +21,7 @@ package org.apache.spark.examples.ml

 // $example on$
 import org.apache.spark.ml.clustering.BisectingKMeans
+import org.apache.spark.ml.evaluation.ClusteringEvaluator
 // $example off$
 import org.apache.spark.sql.SparkSession

@ -48,9 +49,14 @@ object BisectingKMeansExample {
    val bkm = new BisectingKMeans().setK(2).setSeed(1)
    val model = bkm.fit(dataset)

-    // Evaluate clustering.
-    val cost = model.computeCost(dataset)
-    println(s"Within Set Sum of Squared Errors = $cost")
+    // Make predictions
+    val predictions = model.transform(dataset)
+
+    // Evaluate clustering by computing Silhouette score
+    val evaluator = new ClusteringEvaluator()
+
+    val silhouette = evaluator.evaluate(predictions)
+    println(s"Silhouette with squared euclidean distance = $silhouette")

    // Shows the result.
    println("Cluster Centers: ")