[SPARK-14516][FOLLOWUP] Adding ClusteringEvaluator to examples
## What changes were proposed in this pull request?

In SPARK-14516 we introduced ClusteringEvaluator, but we did not add any reference to it in the documentation, and the examples still relied on the sum of squared errors to show how to evaluate a clustering model. This PR adds ClusteringEvaluator to the examples.

## How was this patch tested?

Manual runs of the examples.

Author: Marco Gaido <mgaido@hortonworks.com>

Closes #19676 from mgaido91/SPARK-14516_examples.
Commit ec873a4fd2 (parent 4289ac9d8d)
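For context, the evaluation pattern the updated examples adopt looks roughly like the minimal Scala sketch below. The `SparkSession` setup and the input path are illustrative assumptions rather than part of this patch; the shipped examples load their own sample data.

```scala
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.sql.SparkSession

object SilhouetteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("SilhouetteSketch")
      .master("local[*]") // illustrative; the real examples rely on spark-submit
      .getOrCreate()

    // Illustrative input path: any DataFrame with a "features" vector column works.
    val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    // Train a k-means model, as the existing examples already do.
    val model = new KMeans().setK(2).setSeed(1L).fit(dataset)

    // transform() adds the "prediction" column that ClusteringEvaluator expects.
    val predictions = model.transform(dataset)

    // ClusteringEvaluator computes the Silhouette score for the clustering.
    val silhouette = new ClusteringEvaluator().evaluate(predictions)
    println(s"Silhouette with squared euclidean distance = $silhouette")

    spark.stop()
  }
}
```

ClusteringEvaluator's defaults (silhouette metric with squared Euclidean distance, reading the "features" and "prediction" columns) match what the examples print, so no extra configuration is needed.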
Java example (`JavaKMeansExample`):

```diff
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml;
 // $example on$
 import org.apache.spark.ml.clustering.KMeansModel;
 import org.apache.spark.ml.clustering.KMeans;
+import org.apache.spark.ml.evaluation.ClusteringEvaluator;
 import org.apache.spark.ml.linalg.Vector;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@@ -51,9 +52,14 @@ public class JavaKMeansExample {
     KMeans kmeans = new KMeans().setK(2).setSeed(1L);
     KMeansModel model = kmeans.fit(dataset);
 
-    // Evaluate clustering by computing Within Set Sum of Squared Errors.
-    double WSSSE = model.computeCost(dataset);
-    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
+    // Make predictions
+    Dataset<Row> predictions = model.transform(dataset);
+
+    // Evaluate clustering by computing Silhouette score
+    ClusteringEvaluator evaluator = new ClusteringEvaluator();
+
+    double silhouette = evaluator.evaluate(predictions);
+    System.out.println("Silhouette with squared euclidean distance = " + silhouette);
 
     // Shows the result.
     Vector[] centers = model.clusterCenters();
```
Python example:

```diff
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 # $example on$
 from pyspark.ml.clustering import KMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
 # $example off$
 
 from pyspark.sql import SparkSession
@@ -45,9 +46,14 @@ if __name__ == "__main__":
     kmeans = KMeans().setK(2).setSeed(1)
     model = kmeans.fit(dataset)
 
-    # Evaluate clustering by computing Within Set Sum of Squared Errors.
-    wssse = model.computeCost(dataset)
-    print("Within Set Sum of Squared Errors = " + str(wssse))
+    # Make predictions
+    predictions = model.transform(dataset)
+
+    # Evaluate clustering by computing Silhouette score
+    evaluator = ClusteringEvaluator()
+
+    silhouette = evaluator.evaluate(predictions)
+    print("Silhouette with squared euclidean distance = " + str(silhouette))
 
     # Shows the result.
     centers = model.clusterCenters()
```
Scala example (`KMeansExample`):

```diff
@@ -21,6 +21,7 @@ package org.apache.spark.examples.ml
 
 // $example on$
 import org.apache.spark.ml.clustering.KMeans
+import org.apache.spark.ml.evaluation.ClusteringEvaluator
 // $example off$
 import org.apache.spark.sql.SparkSession
 
@@ -47,9 +48,14 @@ object KMeansExample {
     val kmeans = new KMeans().setK(2).setSeed(1L)
     val model = kmeans.fit(dataset)
 
-    // Evaluate clustering by computing Within Set Sum of Squared Errors.
-    val WSSSE = model.computeCost(dataset)
-    println(s"Within Set Sum of Squared Errors = $WSSSE")
+    // Make predictions
+    val predictions = model.transform(dataset)
+
+    // Evaluate clustering by computing Silhouette score
+    val evaluator = new ClusteringEvaluator()
+
+    val silhouette = evaluator.evaluate(predictions)
+    println(s"Silhouette with squared euclidean distance = $silhouette")
 
     // Shows the result.
     println("Cluster Centers: ")
```