Revert "[SPARK-13019][DOCS] Replace example code in mllib-statistics.md using include_example"

This reverts commit 1af8de200c.
Xiangrui Meng 2016-03-21 17:42:30 -07:00
parent 3f49e0766f
commit 43ef1e52bf
19 changed files with 382 additions and 1076 deletions
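What the revert does to the guide: each `{% include_example %}` tag that the reverted commit had introduced is swapped back for the inline `{% highlight %}` block it replaced, and the standalone example files those tags pointed to are deleted. A minimal sketch of the swap, using the path from the first hunk below (the highlight body here is illustrative, not the full restored snippet):

{% include_example scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala %}

turns back into an inline block of the form

{% highlight scala %}
// Scala snippet kept directly inside mllib-statistics.md
{% endhighlight %}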


@ -40,7 +40,19 @@ total count.
Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API.
{% include_example scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala %}
{% highlight scala %}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD
val observations: RDD[Vector] = ... // an RDD of Vectors
// Compute column summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(summary.mean) // a dense vector containing the mean value for each column
println(summary.variance) // column-wise variance
println(summary.numNonzeros) // number of nonzeros in each column
{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@ -52,7 +64,24 @@ total count.
Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API.
{% include_example java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java %}
{% highlight java %}
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
JavaSparkContext jsc = ...
JavaRDD<Vector> mat = ... // an RDD of Vectors
// Compute column summary statistics.
MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
System.out.println(summary.mean()); // a dense vector containing the mean value for each column
System.out.println(summary.variance()); // column-wise variance
System.out.println(summary.numNonzeros()); // number of nonzeros in each column
{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@ -63,7 +92,20 @@ total count.
Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API.
{% include_example python/mllib/summary_statistics_example.py %}
{% highlight python %}
from pyspark.mllib.stat import Statistics
sc = ... # SparkContext
mat = ... # an RDD of Vectors
# Compute column summary statistics.
summary = Statistics.colStats(mat)
print(summary.mean())
print(summary.variance())
print(summary.numNonzeros())
{% endhighlight %}
</div>
</div>
@ -82,7 +124,27 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp
Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.
{% include_example scala/org/apache/spark/examples/mllib/CorrelationsExample.scala %}
{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
val sc: SparkContext = ...
val seriesX: RDD[Double] = ... // a series
val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX
// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
// method is not specified, Pearson's method will be used by default.
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
val data: RDD[Vector] = ... // note that each Vector is a row and not a column
// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
// If a method is not specified, Pearson's method will be used by default.
val correlMatrix: Matrix = Statistics.corr(data, "pearson")
{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@ -92,7 +154,28 @@ a `JavaRDD<Vector>`, the output will be a `Double` or the correlation `Matrix` r
Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.
{% include_example java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java %}
{% highlight java %}
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.*;
import org.apache.spark.mllib.stat.Statistics;
JavaSparkContext jsc = ...
JavaDoubleRDD seriesX = ... // a series
JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX
// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
// method is not specified, Pearson's method will be used by default.
Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
JavaRDD<Vector> data = ... // note that each Vector is a row and not a column
// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
// If a method is not specified, Pearson's method will be used by default.
Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@ -102,7 +185,24 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp
Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
{% include_example python/mllib/correlations_example.py %}
{% highlight python %}
from pyspark.mllib.stat import Statistics
sc = ... # SparkContext
seriesX = ... # a series
seriesY = ... # must have the same number of partitions and cardinality as seriesX
# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
# method is not specified, Pearson's method will be used by default.
print(Statistics.corr(seriesX, seriesY, method="pearson"))
data = ... # an RDD of Vectors
# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
# If a method is not specified, Pearson's method will be used by default.
print(Statistics.corr(data, method="pearson"))
{% endhighlight %}
</div>
</div>
@ -128,7 +228,21 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K
keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
size, whereas sampling with replacement requires two additional passes.
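(In symbols, the exact-size guarantee described above is that `sampleByKeyExact` draws $\lceil f_k \cdot n_k \rceil$ items for every key $k \in K$, with $f_k$, $n_k$, and $K$ as defined in the surrounding text; the additional pass is what makes that count exact rather than approximate.)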
{% include_example scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala %}
{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.PairRDDFunctions
val sc: SparkContext = ...
val data = ... // an RDD[(K, V)] of any key value pairs
val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key
// Get an exact sample from each stratum
val approxSample = data.sampleByKey(withReplacement = false, fractions)
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@ -138,7 +252,22 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K
keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample
size, whereas sampling with replacement requires two additional passes.
{% include_example java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java %}
{% highlight java %}
import java.util.Map;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
JavaSparkContext jsc = ...
JavaPairRDD<K, V> data = ... // an RDD of any key value pairs
Map<K, Object> fractions = ... // specify the exact fraction desired from each key
// Get an exact sample from each stratum
JavaPairRDD<K, V> approxSample = data.sampleByKey(false, fractions);
JavaPairRDD<K, V> exactSample = data.sampleByKeyExact(false, fractions);
{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to
@ -148,7 +277,16 @@ set of keys.
*Note:* `sampleByKeyExact()` is currently not supported in Python.
{% include_example python/mllib/stratified_sampling_example.py %}
{% highlight python %}
sc = ... # SparkContext
data = ... # an RDD of any key value pairs
fractions = ... # specify the exact fraction desired from each key as a dictionary
approxSample = data.sampleByKey(False, fractions)
{% endhighlight %}
</div>
</div>
@ -170,7 +308,41 @@ independence tests.
run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
hypothesis tests.
{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala %}
{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.stat.test.ChiSqTestResult
import org.apache.spark.rdd.RDD
val sc: SparkContext = ...
val vec: Vector = ... // a vector composed of the frequencies of events
// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
// the test runs against a uniform distribution.
val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom,
// test statistic, the method used, and the null hypothesis.
val mat: Matrix = ... // a contingency matrix
// conduct Pearson's independence test on the input contingency matrix
val independenceTestResult = Statistics.chiSqTest(mat)
println(independenceTestResult) // summary of the test including the p-value, degrees of freedom...
val obs: RDD[LabeledPoint] = ... // (feature, label) pairs.
// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
// against the label.
val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
var i = 1
featureTestResults.foreach { result =>
println(s"Column $i:\n$result")
i += 1
} // summary of the test
{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@ -180,7 +352,46 @@ hypothesis tests.
Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.
{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %}
{% highlight java %}
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.*;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;
JavaSparkContext jsc = ...
Vector vec = ... // a vector composed of the frequencies of events
// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
// the test runs against a uniform distribution.
ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
// summary of the test including the p-value, degrees of freedom, test statistic, the method used,
// and the null hypothesis.
System.out.println(goodnessOfFitTestResult);
Matrix mat = ... // a contingency matrix
// conduct Pearson's independence test on the input contingency matrix
ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
// summary of the test including the p-value, degrees of freedom...
System.out.println(independenceTestResult);
JavaRDD<LabeledPoint> obs = ... // an RDD of labeled points
// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
// against the label.
ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
int i = 1;
for (ChiSqTestResult result : featureTestResults) {
System.out.println("Column " + i + ":");
System.out.println(result); // summary of the test
i++;
}
{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@ -190,7 +401,39 @@ hypothesis tests.
Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
{% include_example python/mllib/hypothesis_testing_example.py %}
{% highlight python %}
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
sc = SparkContext()
vec = Vectors.dense(...) # a vector composed of the frequencies of events
# compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
# the test runs against a uniform distribution.
goodnessOfFitTestResult = Statistics.chiSqTest(vec)
print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom,
# test statistic, the method used, and the null hypothesis.
mat = Matrices.dense(...) # a contingency matrix
# conduct Pearson's independence test on the input contingency matrix
independenceTestResult = Statistics.chiSqTest(mat)
print(independenceTestResult) # summary of the test including the p-value, degrees of freedom...
obs = sc.parallelize(...)  # LabeledPoint(feature, label) pairs
# The contingency table is constructed from an RDD of LabeledPoint and used to conduct
# the independence test. Returns an array containing the ChiSquaredTestResult for every feature
# against the label.
featureTestResults = Statistics.chiSqTest(obs)
for i, result in enumerate(featureTestResults):
    print("Column %d:" % (i + 1))
    print(result)
{% endhighlight %}
</div>
</div>
@ -212,7 +455,21 @@ and interpret the hypothesis tests.
Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API.
{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala %}
{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
val data: RDD[Double] = ... // an RDD of sample data
// run a KS test for the sample versus a standard normal distribution
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
println(testResult) // summary of the test including the p-value, test statistic,
// and null hypothesis
// if our p-value indicates significance, we can reject the null hypothesis
// perform a KS test using a cumulative distribution function of our making
val myCDF: Double => Double = ...
val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@ -222,7 +479,23 @@ and interpret the hypothesis tests.
Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API.
{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %}
{% highlight java %}
import java.util.Arrays;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
JavaSparkContext jsc = ...
JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...));
KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
// summary of the test including the p-value, test statistic,
// and null hypothesis
// if our p-value indicates significance, we can reject the null hypothesis
System.out.println(testResult);
{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@ -232,7 +505,19 @@ and interpret the hypothesis tests.
Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API.
{% include_example python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py %}
{% highlight python %}
from pyspark.mllib.stat import Statistics
parallelData = sc.parallelize([1.0, 2.0, ... ])
# run a KS test for the sample versus a standard normal distribution
testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
print(testResult) # summary of the test including the p-value, test statistic,
# and null hypothesis
# if our p-value indicates significance, we can reject the null hypothesis
# Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
# a lambda to calculate the CDF is not made available in the Python API
{% endhighlight %}
</div>
</div>
@ -366,7 +651,21 @@ to do so.
Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API.
{% include_example scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala %}
{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD
val data: RDD[Double] = ... // an RDD of sample data
// Construct the density estimator with the sample data and a standard deviation for the Gaussian
// kernels
val kd = new KernelDensity()
.setSample(data)
.setBandwidth(3.0)
// Find density estimates for the given values
val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@ -376,7 +675,21 @@ to do so.
Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API.
{% include_example java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java %}
{% highlight java %}
import org.apache.spark.mllib.stat.KernelDensity;
import org.apache.spark.rdd.RDD;
RDD<Double> data = ... // an RDD of sample data
// Construct the density estimator with the sample data and a standard deviation for the Gaussian
// kernels
KernelDensity kd = new KernelDensity()
.setSample(data)
.setBandwidth(3.0);
// Find density estimates for the given values
double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@ -386,7 +699,20 @@ to do so.
Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API.
{% include_example python/mllib/kernel_density_estimation_example.py %}
{% highlight python %}
from pyspark.mllib.stat import KernelDensity
data = ... # an RDD of sample data
# Construct the density estimator with the sample data and a standard deviation for the Gaussian
# kernels
kd = KernelDensity()
kd.setSample(data)
kd.setBandwidth(3.0)
# Find density estimates for the given values
densities = kd.estimate([-1.0, 2.0, 5.0])
{% endhighlight %}
</div>
</div>


@ -1,70 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.mllib;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.Statistics;
// $example off$
public class JavaCorrelationsExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
// $example on$
JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series
// must have the same number of partitions and cardinality as seriesX
JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));
// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
// If a method is not specified, Pearson's method will be used by default.
Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
System.out.println("Correlation is: " + correlation);
// note that each Vector is a row and not a column
JavaRDD<Vector> data = jsc.parallelize(
Arrays.asList(
Vectors.dense(1.0, 10.0, 100.0),
Vectors.dense(2.0, 20.0, 200.0),
Vectors.dense(5.0, 33.0, 366.0)
)
);
// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
// If a method is not specified, Pearson's method will be used by default.
Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
System.out.println(correlMatrix.toString());
// $example off$
jsc.stop();
}
}


@ -1,84 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.mllib;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.ChiSqTestResult;
// $example off$
public class JavaHypothesisTestingExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
// $example on$
// a vector composed of the frequencies of events
Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);
// compute the goodness of fit. If a second vector to test against is not supplied
// as a parameter, the test runs against a uniform distribution.
ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
// summary of the test including the p-value, degrees of freedom, test statistic,
// the method used, and the null hypothesis.
System.out.println(goodnessOfFitTestResult + "\n");
// Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
// conduct Pearson's independence test on the input contingency matrix
ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
// summary of the test including the p-value, degrees of freedom...
System.out.println(independenceTestResult + "\n");
// an RDD of labeled points
JavaRDD<LabeledPoint> obs = jsc.parallelize(
Arrays.asList(
new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
)
);
// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
// against the label.
ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
int i = 1;
for (ChiSqTestResult result : featureTestResults) {
System.out.println("Column " + i + ":");
System.out.println(result + "\n"); // summary of the test
i++;
}
// $example off$
jsc.stop();
}
}


@ -1,49 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.mllib;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.mllib.stat.Statistics;
import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
// $example off$
public class JavaHypothesisTestingKolmogorovSmirnovTestExample {
public static void main(String[] args) {
SparkConf conf =
new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
// $example on$
JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
KolmogorovSmirnovTestResult testResult =
Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
// summary of the test including the p-value, test statistic, and null hypothesis
// if our p-value indicates significance, we can reject the null hypothesis
System.out.println(testResult);
// $example off$
jsc.stop();
}
}


@ -1,53 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.mllib;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.stat.KernelDensity;
// $example off$
public class JavaKernelDensityEstimationExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
// $example on$
// an RDD of sample data
JavaRDD<Double> data = jsc.parallelize(
Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));
// Construct the density estimator with the sample data
// and a standard deviation for the Gaussian kernels
KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);
// Find density estimates for the given values
double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});
System.out.println(Arrays.toString(densities));
// $example off$
jsc.stop();
}
}


@ -1,75 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.mllib;
import com.google.common.collect.ImmutableMap;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.*;
import scala.Tuple2;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.VoidFunction;
// $example off$
public class JavaStratifiedSamplingExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
// $example on$
List<Tuple2<Integer, Character>> list = new ArrayList<Tuple2<Integer, Character>>(
Arrays.<Tuple2<Integer, Character>>asList(
new Tuple2(1, 'a'),
new Tuple2(1, 'b'),
new Tuple2(2, 'c'),
new Tuple2(2, 'd'),
new Tuple2(2, 'e'),
new Tuple2(3, 'f')
)
);
JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);
// specify the exact fraction desired from each key Map<K, Object>
ImmutableMap<Integer, Object> fractions =
ImmutableMap.of(1, (Object)0.1, 2, (Object) 0.6, 3, (Object) 0.3);
// Get an approximate sample from each stratum
JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
// Get an exact sample from each stratum
JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
// $example off$
System.out.println("approxSample size is " + approxSample.collect().size());
for (Tuple2<Integer, Character> t : approxSample.collect()) {
System.out.println(t._1() + " " + t._2());
}
System.out.println("exactSample size is " + exactSample.collect().size());
for (Tuple2<Integer, Character> t : exactSample.collect()) {
System.out.println(t._1() + " " + t._2());
}
jsc.stop();
}
}


@ -1,56 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.mllib;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;
// $example off$
public class JavaSummaryStatisticsExample {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
// $example on$
JavaRDD<Vector> mat = jsc.parallelize(
Arrays.asList(
Vectors.dense(1.0, 10.0, 100.0),
Vectors.dense(2.0, 20.0, 200.0),
Vectors.dense(3.0, 30.0, 300.0)
)
); // an RDD of Vectors
// Compute column summary statistics.
MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
System.out.println(summary.mean()); // a dense vector containing the mean value for each column
System.out.println(summary.variance()); // column-wise variance
System.out.println(summary.numNonzeros()); // number of nonzeros in each column
// $example off$
jsc.stop();
}
}


@ -1,48 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
import numpy as np
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="CorrelationsExample")  # SparkContext
    # $example on$
    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
    # seriesY must have the same number of partitions and cardinality as seriesX
    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])
    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))
    data = sc.parallelize(
        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
    )  # an RDD of Vectors
    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    # If a method is not specified, Pearson's method will be used by default.
    print(Statistics.corr(data, method="pearson"))
    # $example off$
    sc.stop()


@ -1,65 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")
    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events
    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)
    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)
    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix
    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)
    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)
    obs = sc.parallelize(
        [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
         LabeledPoint(1.0, [1.0, 2.0, 0.0]),
         LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
    )  # LabeledPoint(feature, label)
    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    # against the label.
    featureTestResults = Statistics.chiSqTest(obs)
    for i, result in enumerate(featureTestResults):
        print("Column %d:\n%s" % (i + 1, result))
    # $example off$
    sc.stop()


@ -1,40 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import Statistics
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")
    # $example on$
    parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25])
    # run a KS test for the sample versus a standard normal distribution
    testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
    # summary of the test including the p-value, test statistic, and null hypothesis
    # if our p-value indicates significance, we can reject the null hypothesis
    # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
    # a lambda to calculate the CDF is not made available in the Python API
    print(testResult)
    # $example off$
    sc.stop()


@ -1,44 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
# $example on$
from pyspark.mllib.stat import KernelDensity
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="KernelDensityEstimationExample")  # SparkContext
    # $example on$
    # an RDD of sample data
    data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0])
    # Construct the density estimator with the sample data and a standard deviation for the Gaussian
    # kernels
    kd = KernelDensity()
    kd.setSample(data)
    kd.setBandwidth(3.0)
    # Find density estimates for the given values
    densities = kd.estimate([-1.0, 2.0, 5.0])
    # $example off$
    print(densities)
    sc.stop()


@ -1,38 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
if __name__ == "__main__":
    sc = SparkContext(appName="StratifiedSamplingExample")  # SparkContext
    # $example on$
    # an RDD of any key value pairs
    data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')])
    # specify the exact fraction desired from each key as a dictionary
    fractions = {1: 0.1, 2: 0.6, 3: 0.3}
    approxSample = data.sampleByKey(False, fractions)
    # $example off$
    for each in approxSample.collect():
        print(each)
    sc.stop()


@ -1,42 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function
from pyspark import SparkContext
# $example on$
import numpy as np
from pyspark.mllib.stat import Statistics
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext
    # $example on$
    mat = sc.parallelize(
        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])]
    )  # an RDD of Vectors
    # Compute column summary statistics.
    summary = Statistics.colStats(mat)
    print(summary.mean())  # a dense vector containing the mean value for each column
    print(summary.variance())  # column-wise variance
    print(summary.numNonzeros())  # number of nonzeros in each column
    # $example off$
    sc.stop()


@ -1,62 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
package org.apache.spark.examples.mllib
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$
object CorrelationsExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("CorrelationsExample")
val sc = new SparkContext(conf)
// $example on$
val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series
// must have the same number of partitions and cardinality as seriesX
val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555))
// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
// method is not specified, Pearson's method will be used by default.
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
println(s"Correlation is: $correlation")
val data: RDD[Vector] = sc.parallelize(
Seq(
Vectors.dense(1.0, 10.0, 100.0),
Vectors.dense(2.0, 20.0, 200.0),
Vectors.dense(5.0, 33.0, 366.0))
) // note that each Vector is a row and not a column
// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method
// If a method is not specified, Pearson's method will be used by default.
val correlMatrix: Matrix = Statistics.corr(data, "pearson")
println(correlMatrix.toString)
// $example off$
sc.stop()
}
}
// scalastyle:on println


@ -1,80 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
package org.apache.spark.examples.mllib
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.stat.test.ChiSqTestResult
import org.apache.spark.rdd.RDD
// $example off$
object HypothesisTestingExample {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("HypothesisTestingExample")
val sc = new SparkContext(conf)
// $example on$
// a vector composed of the frequencies of events
val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
// compute the goodness of fit. If a second vector to test against is not supplied
// as a parameter, the test runs against a uniform distribution.
val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
// summary of the test including the p-value, degrees of freedom, test statistic, the method
// used, and the null hypothesis.
println(s"$goodnessOfFitTestResult\n")
// a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
// conduct Pearson's independence test on the input contingency matrix
val independenceTestResult = Statistics.chiSqTest(mat)
// summary of the test including the p-value, degrees of freedom
println(s"$independenceTestResult\n")
val obs: RDD[LabeledPoint] =
sc.parallelize(
Seq(
LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
)
) // (feature, label) pairs.
// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
// against the label.
val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
featureTestResults.zipWithIndex.foreach { case (k, v) =>
println("Column " + (v + 1).toString + ":")
println(k)
} // summary of the test
// $example off$
sc.stop()
}
}
// scalastyle:on println


@ -1,54 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
package org.apache.spark.examples.mllib
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$
object HypothesisTestingKolmogorovSmirnovTestExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
val sc = new SparkContext(conf)
// $example on$
val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data
// run a KS test for the sample versus a standard normal distribution
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
// summary of the test including the p-value, test statistic, and null hypothesis if our p-value
// indicates significance, we can reject the null hypothesis.
println(testResult)
println()
// perform a KS test using a cumulative distribution function of our making
val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
println(testResult2)
// $example off$
sc.stop()
}
}
// scalastyle:on println


@ -1,54 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
package org.apache.spark.examples.mllib
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD
// $example off$
object KernelDensityEstimationExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("KernelDensityEstimationExample")
val sc = new SparkContext(conf)
// $example on$
// an RDD of sample data
val data: RDD[Double] = sc.parallelize(Seq(1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9))
// Construct the density estimator with the sample data and a standard deviation
// for the Gaussian kernels
val kd = new KernelDensity()
.setSample(data)
.setBandwidth(3.0)
// Find density estimates for the given values
val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
// $example off$
densities.foreach(println)
sc.stop()
}
}
// scalastyle:on println


@ -1,53 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
package org.apache.spark.examples.mllib
import org.apache.spark.{SparkConf, SparkContext}
object StratifiedSamplingExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("StratifiedSamplingExample")
val sc = new SparkContext(conf)
// $example on$
// an RDD[(K, V)] of any key value pairs
val data = sc.parallelize(
Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))
// specify the exact fraction desired from each key
val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)
// Get an approximate sample from each stratum
val approxSample = data.sampleByKey(withReplacement = false, fractions)
// Get an exact sample from each stratum
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
// $example off$
println("approxSample size is " + approxSample.collect().size.toString)
approxSample.collect().foreach(println)
println("exactSample its size is " + exactSample.collect().size.toString)
exactSample.collect().foreach(println)
sc.stop()
}
}
// scalastyle:on println


@ -1,53 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
package org.apache.spark.examples.mllib
import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$
object SummaryStatisticsExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("SummaryStatisticsExample")
val sc = new SparkContext(conf)
// $example on$
val observations = sc.parallelize(
Seq(
Vectors.dense(1.0, 10.0, 100.0),
Vectors.dense(2.0, 20.0, 200.0),
Vectors.dense(3.0, 30.0, 300.0)
)
)
// Compute column summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(summary.mean) // a dense vector containing the mean value for each column
println(summary.variance) // column-wise variance
println(summary.numNonzeros) // number of nonzeros in each column
// $example off$
sc.stop()
}
}
// scalastyle:on println