[SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator

## What changes were proposed in this pull request?

Using `computeCost` for evaluating a model is a very poor approach. We should advise users to use the better approach that is available, i.e. using the `ClusteringEvaluator` to evaluate their models. This PR updates the examples for `BisectingKMeans` to do that.

## How was this patch tested?

Ran the updated examples.

Closes #22786 from mgaido91/SPARK-25764.

Authored-by: Marco Gaido <marcogaido91@gmail.com>
Signed-off-by: DB Tsai <d_tsai@apple.com>
This commit is contained in:
Marco Gaido 2018-11-05 22:42:04 +00:00 committed by DB Tsai
parent 486acda8c5
commit 0b59170001
No known key found for this signature in database
GPG key ID: E6FD79DA81FE14FD
3 changed files with 27 additions and 9 deletions

View file

@ -20,6 +20,7 @@ package org.apache.spark.examples.ml;
// $example on$
import org.apache.spark.ml.clustering.BisectingKMeans;
import org.apache.spark.ml.clustering.BisectingKMeansModel;
import org.apache.spark.ml.evaluation.ClusteringEvaluator;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
@ -50,9 +51,14 @@ public class JavaBisectingKMeansExample {
BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1);
BisectingKMeansModel model = bkm.fit(dataset);
// Evaluate clustering.
double cost = model.computeCost(dataset);
System.out.println("Within Set Sum of Squared Errors = " + cost);
// Make predictions
Dataset<Row> predictions = model.transform(dataset);
// Evaluate clustering by computing Silhouette score
ClusteringEvaluator evaluator = new ClusteringEvaluator();
double silhouette = evaluator.evaluate(predictions);
System.out.println("Silhouette with squared euclidean distance = " + silhouette);
// Shows the result.
System.out.println("Cluster Centers: ");

View file

@ -24,6 +24,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
# $example off$
from pyspark.sql import SparkSession
@ -41,9 +42,14 @@ if __name__ == "__main__":
bkm = BisectingKMeans().setK(2).setSeed(1)
model = bkm.fit(dataset)
# Evaluate clustering.
cost = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(cost))
# Make predictions
predictions = model.transform(dataset)
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))
# Shows the result.
print("Cluster Centers: ")

View file

@ -21,6 +21,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.clustering.BisectingKMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
// $example off$
import org.apache.spark.sql.SparkSession
@ -48,9 +49,14 @@ object BisectingKMeansExample {
val bkm = new BisectingKMeans().setK(2).setSeed(1)
val model = bkm.fit(dataset)
// Evaluate clustering.
val cost = model.computeCost(dataset)
println(s"Within Set Sum of Squared Errors = $cost")
// Make predictions
val predictions = model.transform(dataset)
// Evaluate clustering by computing Silhouette score
val evaluator = new ClusteringEvaluator()
val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")
// Shows the result.
println("Cluster Centers: ")