[SPARK-14516][FOLLOWUP] Adding ClusteringEvaluator to examples

## What changes were proposed in this pull request?

In SPARK-14516 we introduced ClusteringEvaluator, but we never referenced it in the documentation, and the examples still relied on the within-set sum of squared errors to show how to evaluate a clustering model.

This PR adds ClusteringEvaluator to the examples.
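For context, this is the evaluation pattern the examples move to: instead of calling `model.computeCost`, the model's predictions are scored with a `ClusteringEvaluator` (Silhouette metric). A minimal sketch in Scala, assuming an existing `SparkSession` named `spark` and the sample dataset shipped with Spark:

```scala
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator

// Load the sample k-means data shipped with Spark.
val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

// Train a k-means model.
val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(dataset)

// The evaluator scores predictions, not the model itself:
// transform first, then evaluate. The default metric is the
// Silhouette score with squared euclidean distance.
val predictions = model.transform(dataset)
val evaluator = new ClusteringEvaluator()
val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")
```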

## How was this patch tested?

Manual runs of the examples.

Author: Marco Gaido <mgaido@hortonworks.com>

Closes #19676 from mgaido91/SPARK-14516_examples.
Authored by Marco Gaido on 2017-12-11 06:35:31 -06:00; committed by Sean Owen.
Commit ec873a4fd2 (parent 4289ac9d8d). 3 changed files with 27 additions and 9 deletions.

`examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java`

```diff
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml;
 // $example on$
 import org.apache.spark.ml.clustering.KMeansModel;
 import org.apache.spark.ml.clustering.KMeans;
+import org.apache.spark.ml.evaluation.ClusteringEvaluator;
 import org.apache.spark.ml.linalg.Vector;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@@ -51,9 +52,14 @@ public class JavaKMeansExample {
     KMeans kmeans = new KMeans().setK(2).setSeed(1L);
     KMeansModel model = kmeans.fit(dataset);
 
-    // Evaluate clustering by computing Within Set Sum of Squared Errors.
-    double WSSSE = model.computeCost(dataset);
-    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
+    // Make predictions
+    Dataset<Row> predictions = model.transform(dataset);
+
+    // Evaluate clustering by computing Silhouette score
+    ClusteringEvaluator evaluator = new ClusteringEvaluator();
+
+    double silhouette = evaluator.evaluate(predictions);
+    System.out.println("Silhouette with squared euclidean distance = " + silhouette);
 
     // Shows the result.
     Vector[] centers = model.clusterCenters();
```

`examples/src/main/python/ml/kmeans_example.py`

```diff
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 # $example on$
 from pyspark.ml.clustering import KMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
 # $example off$
 
 from pyspark.sql import SparkSession
@@ -45,9 +46,14 @@ if __name__ == "__main__":
     kmeans = KMeans().setK(2).setSeed(1)
     model = kmeans.fit(dataset)
 
-    # Evaluate clustering by computing Within Set Sum of Squared Errors.
-    wssse = model.computeCost(dataset)
-    print("Within Set Sum of Squared Errors = " + str(wssse))
+    # Make predictions
+    predictions = model.transform(dataset)
+
+    # Evaluate clustering by computing Silhouette score
+    evaluator = ClusteringEvaluator()
+
+    silhouette = evaluator.evaluate(predictions)
+    print("Silhouette with squared euclidean distance = " + str(silhouette))
 
     # Shows the result.
     centers = model.clusterCenters()
```

`examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala`

```diff
@@ -21,6 +21,7 @@ package org.apache.spark.examples.ml
 
 // $example on$
 import org.apache.spark.ml.clustering.KMeans
+import org.apache.spark.ml.evaluation.ClusteringEvaluator
 // $example off$
 
 import org.apache.spark.sql.SparkSession
@@ -47,9 +48,14 @@ object KMeansExample {
     val kmeans = new KMeans().setK(2).setSeed(1L)
     val model = kmeans.fit(dataset)
 
-    // Evaluate clustering by computing Within Set Sum of Squared Errors.
-    val WSSSE = model.computeCost(dataset)
-    println(s"Within Set Sum of Squared Errors = $WSSSE")
+    // Make predictions
+    val predictions = model.transform(dataset)
+
+    // Evaluate clustering by computing Silhouette score
+    val evaluator = new ClusteringEvaluator()
+
+    val silhouette = evaluator.evaluate(predictions)
+    println(s"Silhouette with squared euclidean distance = $silhouette")
 
     // Shows the result.
     println("Cluster Centers: ")
```