[SPARK-14516][FOLLOWUP] Adding ClusteringEvaluator to examples

## What changes were proposed in this pull request?

In SPARK-14516 we introduced ClusteringEvaluator, but we never referenced it in the documentation, and the examples still relied on the within-set sum of squared errors to show how to evaluate a clustering model.

This PR adds ClusteringEvaluator to the examples.
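For context, this is the evaluation pattern the examples move to: instead of calling `model.computeCost`, the model's predictions are scored with a `ClusteringEvaluator` (Silhouette metric). A minimal sketch in Scala, assuming an existing `SparkSession` named `spark` and the sample dataset shipped with Spark:

```scala
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator

// Load the sample k-means data shipped with Spark.
val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

// Train a k-means model.
val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(dataset)

// The evaluator scores predictions, not the model itself:
// transform first, then evaluate. The default metric is the
// Silhouette score with squared euclidean distance.
val predictions = model.transform(dataset)
val evaluator = new ClusteringEvaluator()
val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")
```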

## How was this patch tested?

Manual runs of the examples.

Author: Marco Gaido <mgaido@hortonworks.com>

Closes #19676 from mgaido91/SPARK-14516_examples.
Authored by Marco Gaido on 2017-12-11 06:35:31 -06:00; committed by Sean Owen.
Commit ec873a4fd2 (parent 4289ac9d8d). 3 changed files with 27 additions and 9 deletions.

`examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java`

```diff
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml;
 // $example on$
 import org.apache.spark.ml.clustering.KMeansModel;
 import org.apache.spark.ml.clustering.KMeans;
+import org.apache.spark.ml.evaluation.ClusteringEvaluator;
 import org.apache.spark.ml.linalg.Vector;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@@ -51,9 +52,14 @@ public class JavaKMeansExample {
     KMeans kmeans = new KMeans().setK(2).setSeed(1L);
     KMeansModel model = kmeans.fit(dataset);
 
-    // Evaluate clustering by computing Within Set Sum of Squared Errors.
-    double WSSSE = model.computeCost(dataset);
-    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
+    // Make predictions
+    Dataset<Row> predictions = model.transform(dataset);
+
+    // Evaluate clustering by computing Silhouette score
+    ClusteringEvaluator evaluator = new ClusteringEvaluator();
+
+    double silhouette = evaluator.evaluate(predictions);
+    System.out.println("Silhouette with squared euclidean distance = " + silhouette);
 
     // Shows the result.
     Vector[] centers = model.clusterCenters();
```

`examples/src/main/python/ml/kmeans_example.py`

```diff
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 # $example on$
 from pyspark.ml.clustering import KMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
 # $example off$
 
 from pyspark.sql import SparkSession
@@ -45,9 +46,14 @@ if __name__ == "__main__":
     kmeans = KMeans().setK(2).setSeed(1)
     model = kmeans.fit(dataset)
 
-    # Evaluate clustering by computing Within Set Sum of Squared Errors.
-    wssse = model.computeCost(dataset)
-    print("Within Set Sum of Squared Errors = " + str(wssse))
+    # Make predictions
+    predictions = model.transform(dataset)
+
+    # Evaluate clustering by computing Silhouette score
+    evaluator = ClusteringEvaluator()
+
+    silhouette = evaluator.evaluate(predictions)
+    print("Silhouette with squared euclidean distance = " + str(silhouette))
 
     # Shows the result.
     centers = model.clusterCenters()
```

`examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala`

```diff
@@ -21,6 +21,7 @@ package org.apache.spark.examples.ml
 
 // $example on$
 import org.apache.spark.ml.clustering.KMeans
+import org.apache.spark.ml.evaluation.ClusteringEvaluator
 // $example off$
 
 import org.apache.spark.sql.SparkSession
@@ -47,9 +48,14 @@ object KMeansExample {
     val kmeans = new KMeans().setK(2).setSeed(1L)
     val model = kmeans.fit(dataset)
 
-    // Evaluate clustering by computing Within Set Sum of Squared Errors.
-    val WSSSE = model.computeCost(dataset)
-    println(s"Within Set Sum of Squared Errors = $WSSSE")
+    // Make predictions
+    val predictions = model.transform(dataset)
+
+    // Evaluate clustering by computing Silhouette score
+    val evaluator = new ClusteringEvaluator()
+
+    val silhouette = evaluator.evaluate(predictions)
+    println(s"Silhouette with squared euclidean distance = $silhouette")
 
     // Shows the result.
     println("Cluster Centers: ")
```