From bbae20ade14e50541e4403ca7b45bf6c11695d15 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 12 Aug 2016 10:06:17 -0700
Subject: [PATCH] [SPARK-17033][ML][MLLIB] GaussianMixture should use
 treeAggregate to improve performance

## What changes were proposed in this pull request?
```GaussianMixture``` should use ```treeAggregate``` rather than ```aggregate``` to improve performance and scalability. In my test of dataset with 200 features and 1M instance, I found there is 20% increased performance.
BTW, we should destroy broadcast variable ```compute``` at the end of each iteration.

## How was this patch tested?
Existing tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #14621 from yanboliang/spark-17033.
---
 .../org/apache/spark/mllib/clustering/GaussianMixture.scala    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index a214b1a26f..43193adf3e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -198,7 +198,7 @@ class GaussianMixture private (
       val compute = sc.broadcast(ExpectationSum.add(weights, gaussians)_)
 
       // aggregate the cluster contribution for all sample points
-      val sums = breezeData.aggregate(ExpectationSum.zero(k, d))(compute.value, _ += _)
+      val sums = breezeData.treeAggregate(ExpectationSum.zero(k, d))(compute.value, _ += _)
 
       // Create new distributions based on the partial assignments
       // (often referred to as the "M" step in literature)
@@ -227,6 +227,7 @@ class GaussianMixture private (
       llhp = llh // current becomes previous
       llh = sums.logLikelihood // this is the freshly computed log-likelihood
       iter += 1
+      compute.destroy(blocking = false)
     }
 
     new GaussianMixtureModel(weights, gaussians)